diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 32 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 14 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/JATS.hs | 169 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 30 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 4 |
5 files changed, 207 insertions, 42 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 99e6f99e6..48a512be2 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -73,6 +73,7 @@ import Text.TeXMath (Exp) import Text.TeXMath.Readers.OMML (readOMML) import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, stringToFont) import Text.XML.Light +import qualified Text.XML.Light.Cursor as XMLC data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -117,6 +118,32 @@ mapD f xs = in concatMapM handler xs +unwrapSDT :: NameSpaces -> Content -> Content +unwrapSDT ns (Elem element) + | isElem ns "w" "sdt" element + , Just sdtContent <- findChildByName ns "w" "sdtContent" element + , child : _ <- elChildren sdtContent + = Elem child +unwrapSDT _ content = content + +walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor +walkDocument' ns cur = + let modifiedCur = XMLC.modifyContent (unwrapSDT ns) cur + in + case XMLC.nextDF modifiedCur of + Just cur' -> walkDocument' ns cur' + Nothing -> XMLC.root modifiedCur + +walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument ns element = + let cur = XMLC.fromContent (Elem element) + cur' = walkDocument' ns cur + in + case XMLC.toTree cur' of + Elem element' -> Just element' + _ -> Nothing + + data Docx = Docx Document deriving Show @@ -298,7 +325,10 @@ archiveToDocument zf = do docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem - body <- elemToBody namespaces bodyElem + let bodyElem' = case walkDocument namespaces bodyElem of + Just e -> e + Nothing -> bodyElem + body <- elemToBody namespaces bodyElem' return $ Document namespaces body elemToBody :: NameSpaces -> Element -> D Body diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 3e59c4bf7..05a80335a 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -51,7 +51,7 @@ import Data.Char (isAlphaNum, isDigit, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) import Data.List (intercalate, isPrefixOf) -import Data.List.Split (wordsBy) +import Data.List.Split (wordsBy, splitWhen) import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Monoid (First (..), (<>)) @@ -66,6 +66,7 @@ import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class (PandocMonad (..)) import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import Text.Pandoc.Definition +import Text.Pandoc.Extensions (Extension(..)) import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options ( @@ -191,6 +192,7 @@ block = do , pHtml , pHead , pBody + , pLineBlock , pDiv , pPlain , pFigure @@ -377,6 +379,16 @@ pRawTag = do then return mempty else return $ renderTags' [tag] +pLineBlock :: PandocMonad m => TagParser m Blocks +pLineBlock = try $ do + guardEnabled Ext_line_blocks + _ <- pSatisfy $ tagOpen (=="div") (== [("class","line-block")]) + ils <- trimInlines . mconcat <$> manyTill inline (pSatisfy (tagClose (=="div"))) + let lns = map B.fromList $ + splitWhen (== LineBreak) $ filter (/= SoftBreak) $ + B.toList ils + return $ B.lineBlock lns + pDiv :: PandocMonad m => TagParser m Blocks pDiv = try $ do guardEnabled Ext_native_divs diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index 851fbec35..9223db68c 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -5,6 +5,7 @@ import Data.Char (isDigit, isSpace, toUpper) import Data.Default import Data.Generics import Data.List (intersperse) +import qualified Data.Map as Map import Data.Maybe (maybeToList, fromMaybe) import Data.Text (Text) import qualified Data.Text as T @@ -23,7 +24,6 @@ type JATS m = StateT JATSState m data JATSState = JATSState{ jatsSectionLevel :: Int , jatsQuoteType :: QuoteType , jatsMeta :: Meta - , jatsAcceptsMeta :: Bool , jatsBook :: Bool , jatsFigureTitle :: Inlines , jatsContent :: [Content] @@ -33,7 +33,6 @@ instance Default JATSState where def = JATSState{ jatsSectionLevel = 0 , jatsQuoteType = DoubleQuote , jatsMeta = mempty - , jatsAcceptsMeta = False , jatsBook = False , jatsFigureTitle = mempty , jatsContent = [] } @@ -79,19 +78,6 @@ named s e = qName (elName e) == s -- -acceptingMetadata :: PandocMonad m => JATS m a -> JATS m a -acceptingMetadata p = do - modify (\s -> s { jatsAcceptsMeta = True } ) - res <- p - modify (\s -> s { jatsAcceptsMeta = False }) - return res - -checkInMeta :: (PandocMonad m, Monoid a) => JATS m () -> JATS m a -checkInMeta p = do - accepts <- jatsAcceptsMeta <$> get - when accepts p - return mempty - addMeta :: PandocMonad m => ToMetaValue a => String -> a -> JATS m () addMeta field val = modify (setMeta field val) @@ -179,18 +165,16 @@ parseBlock (Elem e) = <$> listitems "def-list" -> definitionList <$> deflistitems "sec" -> gets jatsSectionLevel >>= sect . (+1) - "title" -> return mempty - "title-group" -> checkInMeta getTitle "graphic" -> para <$> getGraphic e - "journal-meta" -> metaBlock - "article-meta" -> metaBlock - "custom-meta" -> metaBlock + "journal-meta" -> parseMetadata e + "article-meta" -> parseMetadata e + "custom-meta" -> parseMetadata e + "title" -> return mempty -- processed by header "table" -> parseTable "fig" -> divWith (attrValue "id" e, ["fig"], []) <$> getBlocks e "table-wrap" -> divWith (attrValue "id" e, ["table-wrap"], []) <$> getBlocks e "caption" -> divWith (attrValue "id" e, ["caption"], []) <$> sect 6 - "ref-list" -> divWith ("refs", [], []) <$> getBlocks e - "ref" -> divWith ("ref-" <> attrValue "id" e, [], []) <$> getBlocks e + "ref-list" -> parseRefList e "?xml" -> return mempty _ -> getBlocks e where parseMixed container conts = do @@ -231,16 +215,6 @@ parseBlock (Elem e) = terms' <- mapM getInlines terms items' <- mapM getBlocks items return (mconcat $ intersperse (str "; ") terms', items') - getTitle = do - tit <- case filterChild (named "article-title") e of - Just s -> getInlines s - Nothing -> return mempty - subtit <- case filterChild (named "subtitle") e of - Just s -> (text ": " <>) <$> - getInlines s - Nothing -> return mempty - addMeta "title" (tit <> subtit) - parseTable = do let isCaption x = named "title" x || named "caption" x caption <- case filterChild isCaption e of @@ -305,13 +279,127 @@ parseBlock (Elem e) = let ident = attrValue "id" e modify $ \st -> st{ jatsSectionLevel = oldN } return $ headerWith (ident,[],[]) n' headerText <> b --- lineItems = mapM getInlines $ filterChildren (named "line") e - metaBlock = acceptingMetadata (getBlocks e) >> return mempty getInlines :: PandocMonad m => Element -> JATS m Inlines getInlines e' = (trimInlines . mconcat) <$> mapM parseInline (elContent e') +parseMetadata :: PandocMonad m => Element -> JATS m Blocks +parseMetadata e = do + getTitle e + getAuthors e + getAffiliations e + return mempty + +getTitle :: PandocMonad m => Element -> JATS m () +getTitle e = do + tit <- case filterElement (named "article-title") e of + Just s -> getInlines s + Nothing -> return mempty + subtit <- case filterElement (named "subtitle") e of + Just s -> (text ": " <>) <$> + getInlines s + Nothing -> return mempty + when (tit /= mempty) $ addMeta "title" tit + when (subtit /= mempty) $ addMeta "subtitle" subtit + +getAuthors :: PandocMonad m => Element -> JATS m () +getAuthors e = do + authors <- mapM getContrib $ filterElements + (\x -> named "contrib" x && + attrValue "contrib-type" x == "author") e + authorNotes <- mapM getInlines $ filterElements (named "author-notes") e + let authors' = case (reverse authors, authorNotes) of + ([], _) -> [] + (_, []) -> authors + (a:as, ns) -> reverse as ++ [a <> mconcat ns] + unless (null authors) $ addMeta "author" authors' + +getAffiliations :: PandocMonad m => Element -> JATS m () +getAffiliations x = do + affs <- mapM getInlines $ filterChildren (named "aff") x + unless (null affs) $ addMeta "institute" affs + +getContrib :: PandocMonad m => Element -> JATS m Inlines +getContrib x = do + given <- maybe (return mempty) getInlines + $ filterElement (named "given-names") x + family <- maybe (return mempty) getInlines + $ filterElement (named "surname") x + if given == mempty && family == mempty + then return mempty + else if given == mempty || family == mempty + then return $ given <> family + else return $ given <> space <> family + +parseRefList :: PandocMonad m => Element -> JATS m Blocks +parseRefList e = do + refs <- mapM parseRef $ filterChildren (named "ref") e + addMeta "references" refs + return mempty + +parseRef :: PandocMonad m + => Element -> JATS m (Map.Map String MetaValue) +parseRef e = do + let refId = text $ attrValue "id" e + let getInlineText n = maybe (return mempty) getInlines . filterChild (named n) + case filterChild (named "element-citation") e of + Just c -> do + let refType = text $ + case attrValue "publication-type" c of + "journal" -> "article-journal" + x -> x + (refTitle, refContainerTitle) <- do + t <- getInlineText "article-title" c + ct <- getInlineText "source" c + if t == mempty + then return (ct, mempty) + else return (t, ct) + refLabel <- getInlineText "label" c + refYear <- getInlineText "year" c + refVolume <- getInlineText "volume" c + refFirstPage <- getInlineText "fpage" c + refLastPage <- getInlineText "lpage" c + refPublisher <- getInlineText "publisher-name" c + refPublisherPlace <- getInlineText "publisher-loc" c + let refPages = refFirstPage <> (if refLastPage == mempty + then mempty + else text "\x2013" <> refLastPage) + let personGroups' = filterChildren (named "person-group") c + let getName nm = do + given <- maybe (return mempty) getInlines + $ filterChild (named "given-names") nm + family <- maybe (return mempty) getInlines + $ filterChild (named "surname") nm + return $ toMetaValue $ Map.fromList [ + ("given", given) + , ("family", family) + ] + personGroups <- mapM (\pg -> + do names <- mapM getName + (filterChildren (named "name") pg) + return (attrValue "person-group-type" pg, + toMetaValue names)) + personGroups' + return $ Map.fromList $ + [ ("id", toMetaValue refId) + , ("type", toMetaValue refType) + , ("title", toMetaValue refTitle) + , ("container-title", toMetaValue refContainerTitle) + , ("publisher", toMetaValue refPublisher) + , ("publisher-place", toMetaValue refPublisherPlace) + , ("title", toMetaValue refTitle) + , ("issued", toMetaValue + $ Map.fromList [ + ("year", refYear) + ]) + , ("volume", toMetaValue refVolume) + , ("page", toMetaValue refPages) + , ("citation-label", toMetaValue refLabel) + ] ++ personGroups + Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty + -- TODO handle mixed-citation + strContentRecursive :: Element -> String strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -354,7 +442,15 @@ parseInline (Elem e) = let rid = attrValue "rid" e let refType = ("ref-type",) <$> maybeAttrValue "ref-type" e let attr = (attrValue "id" e, [], maybeToList refType) - return $ linkWith attr ('#' : rid) "" ils + return $ if refType == Just ("ref-type","bibr") + then cite [Citation{ + citationId = rid + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationNoteNum = 0 + , citationHash = 0}] ils + else linkWith attr ('#' : rid) "" ils "ext-link" -> do ils <- innerInlines let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e @@ -375,9 +471,6 @@ parseInline (Elem e) = "uri" -> return $ link (strContent e) "" $ str $ strContent e "fn" -> (note . mconcat) <$> mapM parseBlock (elContent e) - -- Note: this isn't a real docbook tag; it's what we convert - -- <?asciidor-br?> to in handleInstructions, above. A kludge to - -- work around xml-light's inability to parse an instruction. _ -> innerInlines where innerInlines = (trimInlines . mconcat) <$> mapM parseInline (elContent e) diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index f7e45e01a..6c5567ffd 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1489,8 +1489,17 @@ inlineCommands = M.union inlineLanguageCommands $ M.fromList $ -- biblatex misc , ("RN", romanNumeralUpper) , ("Rn", romanNumeralLower) + -- babel + , ("foreignlanguage", foreignlanguage) ] +foreignlanguage :: PandocMonad m => LP m Inlines +foreignlanguage = do + babelLang <- T.unpack . untokenize <$> braced + case babelLangToBCP47 babelLang of + Just lang -> spanWith ("", [], [("lang", renderLang $ lang)]) <$> tok + _ -> tok + inlineLanguageCommands :: PandocMonad m => M.Map Text (LP m Inlines) inlineLanguageCommands = M.fromList $ mk <$> M.toList polyglossiaLangToBCP47 where @@ -2655,3 +2664,24 @@ polyglossiaLangToBCP47 = M.fromList , ("urdu", \_ -> Lang "ur" "" "" []) , ("vietnamese", \_ -> Lang "vi" "" "" []) ] + +babelLangToBCP47 :: String -> Maybe Lang +babelLangToBCP47 s = + case s of + "austrian" -> Just $ Lang "de" "" "AT" ["1901"] + "naustrian" -> Just $ Lang "de" "" "AT" [] + "swissgerman" -> Just $ Lang "de" "" "CH" ["1901"] + "nswissgerman" -> Just $ Lang "de" "" "CH" [] + "german" -> Just $ Lang "de" "" "DE" ["1901"] + "ngerman" -> Just $ Lang "de" "" "DE" [] + "lowersorbian" -> Just $ Lang "dsb" "" "" [] + "uppersorbian" -> Just $ Lang "hsb" "" "" [] + "polutonikogreek" -> Just $ Lang "el" "" "" ["polyton"] + "slovene" -> Just $ Lang "sl" "" "" [] + "australian" -> Just $ Lang "en" "" "AU" [] + "canadian" -> Just $ Lang "en" "" "CA" [] + "british" -> Just $ Lang "en" "" "GB" [] + "newzealand" -> Just $ Lang "en" "" "NZ" [] + "american" -> Just $ Lang "en" "" "US" [] + "classiclatin" -> Just $ Lang "la" "" "" ["x-classic"] + _ -> fmap ($ "") $ M.lookup s polyglossiaLangToBCP47 diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 6b5d0a331..9f259d958 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -547,7 +547,7 @@ bulletListStart :: Monad m => ParserT [Char] st m Int bulletListStart = try $ do notFollowedBy' hrule -- because hrules start out just like lists marker <- oneOf bulletListMarkers - white <- many1 spaceChar + white <- many1 spaceChar <|> "" <$ lookAhead (char '\n') return $ length (marker:white) -- parses ordered list start and returns its length (inc following whitespace) @@ -556,7 +556,7 @@ orderedListStart :: Monad m => ListNumberStyle -> RSTParser m Int orderedListStart style delim = try $ do (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) - white <- many1 spaceChar + white <- many1 spaceChar <|> "" <$ lookAhead (char '\n') return $ markerLen + length white -- parse a line of a list item |