From dbf2a63669bb40e3ecfcd4ae4317d495d50ec0a2 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Wed, 27 May 2015 09:39:05 -0700
Subject: EPUB writer: Improved chapter splitting and internal link rewriting.

Closes #1887.
Closes #2163.
Closes #2162.
---
 src/Text/Pandoc/Writers/EPUB.hs | 87 ++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 53 deletions(-)

(limited to 'src')

diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs
index 4ce7857ac..cfbec7bfa 100644
--- a/src/Text/Pandoc/Writers/EPUB.hs
+++ b/src/Text/Pandoc/Writers/EPUB.hs
@@ -56,10 +56,10 @@ import Text.Pandoc.Options ( WriterOptions(..)
                            , EPUBVersion(..)
                            , ObfuscationMethod(NoObfuscation) )
 import Text.Pandoc.Definition
-import Text.Pandoc.Walk (walk, walkM)
+import Text.Pandoc.Walk (walk, walkM, query)
 import Data.Default
 import Text.Pandoc.Writers.Markdown (writePlain)
-import Control.Monad.State (modify, get, execState, State, put, evalState)
+import Control.Monad.State (modify, get, State, put, evalState)
 import Control.Monad (mplus, liftM, when)
 import Text.XML.Light ( unode, Element(..), unqual, Attr(..), add_attrs
                       , strContent, lookupAttr, Node(..), QName(..), parseXML
@@ -408,11 +408,6 @@ writeEPUB opts doc@(Pandoc meta _) = do
                                                  (docTitle' meta) : blocks
 
   let chapterHeaderLevel = writerEpubChapterLevel opts
-  -- internal reference IDs change when we chunk the file,
-  -- so that '#my-header-1' might turn into 'chap004.xhtml#my-header'.
-  -- the next two lines fix that:
-  let reftable = correlateRefs chapterHeaderLevel blocks'
-  let blocks'' = replaceRefs reftable blocks'
 
   let isChapterHeader (Header n _ _) = n <= chapterHeaderLevel
       isChapterHeader (Div ("",["references"],[]) (Header n _ _:_)) =
@@ -443,7 +438,37 @@ writeEPUB opts doc@(Pandoc meta _) = do
         let (xs,ys) = break isChapterHeader bs
         (Chapter Nothing (b:xs) :) `fmap` toChapters ys
 
-  let chapters = evalState (toChapters blocks'') []
+  let chapters' = evalState (toChapters blocks') []
+
+  let extractLinkURL' :: Int -> Inline -> [(String, String)]
+      extractLinkURL' num (Span (ident, _, _) _)
+        | not (null ident) = [(ident, showChapter num ++ ('#':ident))]
+      extractLinkURL' _ _ = []
+
+  let extractLinkURL :: Int -> Block -> [(String, String)]
+      extractLinkURL num (Div (ident, _, _) _)
+        | not (null ident) = [(ident, showChapter num ++ ('#':ident))]
+      extractLinkURL num (Header _ (ident, _, _) _)
+        | not (null ident) = [(ident, showChapter num ++ ('#':ident))]
+      extractLinkURL num b = query (extractLinkURL' num) b
+
+  let reftable = concat $ zipWith (\(Chapter _ bs) num ->
+                                    query (extractLinkURL num) bs)
+                          chapters' [1..]
+
+  let fixInternalReferences :: Inline -> Inline
+      fixInternalReferences (Link lab ('#':xs, tit)) =
+        case lookup xs reftable of
+             Just ys ->  Link lab (ys, tit)
+             Nothing -> Link lab ('#':xs, tit)
+      fixInternalReferences x = x
+
+  -- internal reference IDs change when we chunk the file,
+  -- so that '#my-header-1' might turn into 'chap004.xhtml#my-header'.
+  -- this fixes that:
+  let chapters = map (\(Chapter mbnum bs) ->
+                         Chapter mbnum $ walk fixInternalReferences bs)
+                 chapters'
 
   let chapToEntry :: Int -> Chapter -> Entry
       chapToEntry num (Chapter mbnum bs) = mkEntry (showChapter num)
@@ -549,7 +574,7 @@ writeEPUB opts doc@(Pandoc meta _) = do
   let contentsEntry = mkEntry "content.opf" contentsData
 
   -- toc.ncx
-  let secs = hierarchicalize blocks''
+  let secs = hierarchicalize blocks'
 
   let tocLevel = writerTOCDepth opts
 
@@ -889,11 +914,6 @@ mediaTypeOf x =
     Just y | any (`isPrefixOf` y) mediaPrefixes -> Just y
     _                                           -> Nothing
 
-data IdentState = IdentState{
-       chapterNumber :: Int,
-       identTable    :: [(String,String)]
-       } deriving (Read, Show)
-
 -- Returns filename for chapter number.
 showChapter :: Int -> String
 showChapter = printf "ch%03d.xhtml"
@@ -910,45 +930,6 @@ addIdentifiers bs = evalState (mapM go bs) []
          return $ Header n (ident',classes,kvs) ils
        go x = return x
 
--- Go through a block list and construct a table
--- correlating the automatically constructed references
--- that would be used in a normal pandoc document with
--- new URLs to be used in the EPUB.  For example, what
--- was "header-1" might turn into "ch006.xhtml#header".
-correlateRefs :: Int -> [Block] -> [(String,String)]
-correlateRefs chapterHeaderLevel bs =
-  identTable $ execState (walkM goBlock bs >>= walkM goInline)
-    IdentState{ chapterNumber = 0
-              , identTable = [] }
- where goBlock :: Block -> State IdentState Block
-       goBlock x@(Header n (ident,_,_) _) = x <$ addIdentifier (Just n) ident
-       goBlock x@(Div (ident,_,_) _) = x <$ addIdentifier Nothing ident
-       goBlock x = return x
-       goInline :: Inline -> State IdentState Inline
-       goInline x@(Span (ident,_,_) _) = x <$ addIdentifier Nothing ident
-       goInline x = return x
-       addIdentifier mbHeaderLevel ident = do
-          case mbHeaderLevel of
-               Just n | n <= chapterHeaderLevel ->
-                    modify $ \s -> s{ chapterNumber = chapterNumber s + 1 }
-               _ -> return ()
-          st <- get
-          let chapterid = showChapter (chapterNumber st) ++
-                          case mbHeaderLevel of
-                               Just n | n <= chapterHeaderLevel -> ""
-                               _ -> '#' : ident
-          modify $ \s -> s{ identTable = (ident, chapterid) : identTable st }
-
--- Replace internal link references using the table produced
--- by correlateRefs.
-replaceRefs :: [(String,String)] -> [Block] -> [Block]
-replaceRefs refTable = walk replaceOneRef
-  where replaceOneRef x@(Link lab ('#':xs,tit)) =
-          case lookup xs refTable of
-                Just url -> Link lab (url,tit)
-                Nothing  -> x
-        replaceOneRef x = x
-
 -- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM
 normalizeDate' :: String -> Maybe String
 normalizeDate' xs =
-- 
cgit v1.2.3