diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
65 files changed, 4123 insertions, 3180 deletions
diff --git a/src/Text/Pandoc/Readers/BibTeX.hs b/src/Text/Pandoc/Readers/BibTeX.hs index 6c96ab30a..318afda85 100644 --- a/src/Text/Pandoc/Readers/BibTeX.hs +++ b/src/Text/Pandoc/Readers/BibTeX.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.BibTeX - Copyright : Copyright (C) 2020 John MacFarlane + Copyright : Copyright (C) 2020-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -23,41 +23,47 @@ where import Text.Pandoc.Options import Text.Pandoc.Definition import Text.Pandoc.Builder (setMeta, cite, str) -import Data.Text (Text) import Citeproc (Lang(..), parseLang) import Citeproc.Locale (getLocale) import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Class (PandocMonad, lookupEnv) import Text.Pandoc.Citeproc.BibTeX as BibTeX import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue) +import Text.Pandoc.Sources (ToSources(..)) import Control.Monad.Except (throwError) -- | Read BibTeX from an input string and return a Pandoc document. -- The document will have only metadata, with an empty body. -- The metadata will contain a `references` field with the -- bibliography entries, and a `nocite` field with the wildcard `[@*]`. -readBibTeX :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readBibTeX :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc readBibTeX = readBibTeX' BibTeX.Bibtex -- | Read BibLaTeX from an input string and return a Pandoc document. -- The document will have only metadata, with an empty body. -- The metadata will contain a `references` field with the -- bibliography entries, and a `nocite` field with the wildcard `[@*]`. -readBibLaTeX :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readBibLaTeX :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc readBibLaTeX = readBibTeX' BibTeX.Biblatex -readBibTeX' :: PandocMonad m => Variant -> ReaderOptions -> Text -> m Pandoc +readBibTeX' :: (PandocMonad m, ToSources a) + => Variant -> ReaderOptions -> a -> m Pandoc readBibTeX' variant _opts t = do - lang <- maybe (Lang "en" (Just "US")) parseLang - <$> lookupEnv "LANG" + mblangEnv <- lookupEnv "LANG" + let defaultLang = Lang "en" Nothing (Just "US") [] [] [] + let lang = case mblangEnv of + Nothing -> defaultLang + Just l -> either (const defaultLang) id $ parseLang l locale <- case getLocale lang of Left e -> - case getLocale (Lang "en" (Just "US")) of + case getLocale (Lang "en" Nothing (Just "US") [] [] []) of Right l -> return l Left _ -> throwError $ PandocCiteprocError e Right l -> return l case BibTeX.readBibtexString variant locale (const True) t of - Left e -> throwError $ PandocParsecError t e + Left e -> throwError $ PandocParsecError (toSources t) e Right refs -> return $ setMeta "references" (map referenceToMetaValue refs) . setMeta "nocite" diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs index f0edcaa16..eca8f9425 100644 --- a/src/Text/Pandoc/Readers/CSV.hs +++ b/src/Text/Pandoc/Readers/CSV.hs @@ -2,8 +2,8 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | - Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2020 John MacFarlane + Module : Text.Pandoc.Readers.CSV + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -13,23 +13,23 @@ Conversion from CSV to a 'Pandoc' table. -} module Text.Pandoc.Readers.CSV ( readCSV ) where -import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.CSV (parseCSV, defaultCSVOptions) import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class (PandocMonad) -import Text.Pandoc.Shared (crFilter) import Text.Pandoc.Error +import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.Pandoc.Options (ReaderOptions) import Control.Monad.Except (throwError) -readCSV :: PandocMonad m +readCSV :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ Text to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc -readCSV _opts s = - case parseCSV defaultCSVOptions (crFilter s) of +readCSV _opts s = do + let txt = sourcesToText $ toSources s + case parseCSV defaultCSVOptions txt of Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) (TableHead nullAttr hdrs) @@ -45,4 +45,4 @@ readCSV _opts s = aligns = replicate numcols AlignDefault widths = replicate numcols ColWidthDefault Right [] -> return $ B.doc mempty - Left e -> throwError $ PandocParsecError s e + Left e -> throwError $ PandocParsecError (toSources [("",txt)]) e diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index c1773eaab..411d64278 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -1,8 +1,9 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.CommonMark - Copyright : Copyright (C) 2015-2020 John MacFarlane + Copyright : Copyright (C) 2015-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -25,17 +26,66 @@ import Text.Pandoc.Definition import Text.Pandoc.Builder as B import Text.Pandoc.Options import Text.Pandoc.Error +import Text.Pandoc.Readers.Metadata (yamlMetaBlock) import Control.Monad.Except import Data.Functor.Identity (runIdentity) +import Data.Typeable +import Text.Pandoc.Parsing (runParserT, getInput, + runF, defaultParserState, option, many1, anyChar, + Sources(..), ToSources(..), ParserT, Future, + sourceName) +import qualified Data.Text as T -- | Parse a CommonMark formatted string into a 'Pandoc' structure. -readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readCommonMark opts s = do - let res = runIdentity $ - commonmarkWith (foldr ($) defaultSyntaxSpec exts) "input" s - case res of - Left err -> throwError $ PandocParsecError s err - Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls +readCommonMark :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc +readCommonMark opts s + | isEnabled Ext_yaml_metadata_block opts = do + let sources = toSources s + let toks = concatMap sourceToToks (unSources sources) + res <- runParserT (do meta <- yamlMetaBlock (metaValueParser opts) + rest <- getInput + return (meta, rest)) + defaultParserState "YAML metadata" (toSources s) + case res of + Left _ -> readCommonMarkBody opts sources toks + Right (meta, rest) -> do + -- strip off metadata section and parse body + let body = concatMap sourceToToks (unSources rest) + Pandoc _ bs <- readCommonMarkBody opts sources body + return $ Pandoc (runF meta defaultParserState) bs + | otherwise = do + let sources = toSources s + let toks = concatMap sourceToToks (unSources sources) + readCommonMarkBody opts sources toks + +sourceToToks :: (SourcePos, Text) -> [Tok] +sourceToToks (pos, s) = tokenize (sourceName pos) s + +metaValueParser :: Monad m + => ReaderOptions -> ParserT Sources st m (Future st MetaValue) +metaValueParser opts = do + inp <- option "" $ T.pack <$> many1 anyChar + let toks = concatMap sourceToToks (unSources (toSources inp)) + case runIdentity (parseCommonmarkWith (specFor opts) toks) of + Left _ -> mzero + Right (Cm bls :: Cm () Blocks) -> return $ return $ B.toMetaValue bls + +readCommonMarkBody :: PandocMonad m => ReaderOptions -> Sources -> [Tok] -> m Pandoc +readCommonMarkBody opts s toks + | isEnabled Ext_sourcepos opts = + case runIdentity (parseCommonmarkWith (specFor opts) toks) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm SourceRange Blocks) -> return $ B.doc bls + | otherwise = + case runIdentity (parseCommonmarkWith (specFor opts) toks) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls + +specFor :: (Monad m, Typeable m, Typeable a, + Rangeable (Cm a Inlines), Rangeable (Cm a Blocks)) + => ReaderOptions -> SyntaxSpec m (Cm a Inlines) (Cm a Blocks) +specFor opts = foldr ($) defaultSyntaxSpec exts where exts = [ (hardLineBreaksSpec <>) | isEnabled Ext_hard_line_breaks opts ] ++ [ (smartPunctuationSpec <>) | isEnabled Ext_smart opts ] ++ @@ -62,5 +112,7 @@ readCommonMark opts s = do | isEnabled Ext_implicit_header_references opts ] ++ [ (footnoteSpec <>) | isEnabled Ext_footnotes opts ] ++ [ (definitionListSpec <>) | isEnabled Ext_definition_lists opts ] ++ - [ (taskListSpec <>) | isEnabled Ext_task_lists opts ] + [ (taskListSpec <>) | isEnabled Ext_task_lists opts ] ++ + [ (rebaseRelativePathsSpec <>) + | isEnabled Ext_rebase_relative_paths opts ] diff --git a/src/Text/Pandoc/Readers/Creole.hs b/src/Text/Pandoc/Readers/Creole.hs index 2658dfea2..ad848ada7 100644 --- a/src/Text/Pandoc/Readers/Creole.hs +++ b/src/Text/Pandoc/Readers/Creole.hs @@ -23,21 +23,20 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed) -import Text.Pandoc.Shared (crFilter) - -- | Read creole from an input string and return a Pandoc document. -readCreole :: PandocMonad m +readCreole :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readCreole opts s = do - res <- readWithM parseCreole def{ stateOptions = opts } $ crFilter s <> "\n\n" + let sources = ensureFinalNewlines 2 (toSources s) + res <- readWithM parseCreole def{ stateOptions = opts } sources case res of Left e -> throwError e Right d -> return d -type CRLParser = ParserT Text ParserState +type CRLParser = ParserT Sources ParserState -- -- Utility functions diff --git a/src/Text/Pandoc/Readers/CslJson.hs b/src/Text/Pandoc/Readers/CslJson.hs index 377186b1e..a0af5c325 100644 --- a/src/Text/Pandoc/Readers/CslJson.hs +++ b/src/Text/Pandoc/Readers/CslJson.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.CslJson - Copyright : Copyright (C) 2020 John MacFarlane + Copyright : Copyright (C) 2020-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -24,21 +24,22 @@ import Text.Pandoc.Options import Text.Pandoc.Definition import Text.Pandoc.Builder (setMeta, cite, str) import qualified Text.Pandoc.UTF8 as UTF8 -import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Class (PandocMonad) import Text.Pandoc.Citeproc.CslJson (cslJsonToReferences) import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue) import Control.Monad.Except (throwError) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) -- | Read CSL JSON from an input string and return a Pandoc document. -- The document will have only metadata, with an empty body. -- The metadata will contain a `references` field with the -- bibliography entries, and a `nocite` field with the wildcard `[@*]`. -readCslJson :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readCslJson _opts t = - case cslJsonToReferences (UTF8.fromText t) of +readCslJson :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc +readCslJson _opts x = + case cslJsonToReferences (UTF8.fromText $ sourcesToText $ toSources x) of Left e -> throwError $ PandocParseError $ T.pack e Right refs -> return $ setMeta "references" (map referenceToMetaValue refs) diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index b0846e345..c49b82ccf 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.DocBook - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -12,23 +12,28 @@ Conversion of DocBook XML to 'Pandoc' document. -} module Text.Pandoc.Readers.DocBook ( readDocBook ) where import Control.Monad.State.Strict -import Data.Char (isSpace, toUpper) +import Data.Char (isSpace, isLetter) import Data.Default import Data.Either (rights) import Data.Foldable (asum) import Data.Generics import Data.List (intersperse,elemIndex) +import Data.List.NonEmpty (nonEmpty) import Data.Maybe (fromMaybe,mapMaybe) import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import Control.Monad.Except (throwError) import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import Text.Pandoc.Options import Text.Pandoc.Logging (LogMessage(..)) -import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) +import Text.Pandoc.Shared (safeRead, extractSpaces) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light +import Text.Pandoc.XML.Light {- @@ -92,7 +97,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] chapterinfo - Meta-information for a Chapter [ ] citation - An inline bibliographic reference to another published work [ ] citebiblioid - A citation of a bibliographic identifier -[ ] citerefentry - A citation to a reference page +[x] citerefentry - A citation to a reference page [ ] citetitle - The title of a cited work [ ] city - The name of a city in an address [x] classname - The name of a class, in the object-oriented programming sense @@ -129,6 +134,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] corpcredit - A corporation or organization credited in a document [ ] corpname - The name of a corporation [ ] country - The name of a country +[x] danger - An admonition set off from the text indicating hazardous situation [ ] database - The name of a database, or part of a database [x] date - The date of publication or revision of a document [ ] dedication - A wrapper for the dedication section of a book @@ -206,7 +212,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] info - A wrapper for information about a component or other block. (DocBook v5) [x] informalequation - A displayed mathematical equation without a title [x] informalexample - A displayed example without a title -[ ] informalfigure - A untitled figure +[x] informalfigure - An untitled figure [ ] informaltable - A table without a title [ ] initializer - The initializer for a FieldSynopsis [x] inlineequation - A mathematical equation or expression occurring inline @@ -535,24 +541,32 @@ instance Default DBState where , dbContent = [] } -readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readDocBook :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readDocBook _ inp = do - let tree = normalizeTree . parseXML . handleInstructions $ crFilter inp + let sources = toSources inp + tree <- either (throwError . PandocXMLError "") return $ + parseXMLContents + (TL.fromStrict . handleInstructions . sourcesToText $ sources) (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree return $ Pandoc (dbMeta st') (toList . mconcat $ bs) --- We treat <?asciidoc-br?> specially (issue #1236), converting it --- to <br/>, since xml-light doesn't parse the instruction correctly. --- Other xml instructions are simply removed from the input stream. +-- We treat certain processing instructions by converting them to tags +-- beginning "pi-". handleInstructions :: Text -> Text -handleInstructions = T.pack . handleInstructions' . T.unpack - -handleInstructions' :: String -> String -handleInstructions' ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions' xs -handleInstructions' xs = case break (=='<') xs of - (ys, []) -> ys - ([], '<':zs) -> '<' : handleInstructions' zs - (ys, zs) -> ys ++ handleInstructions' zs +handleInstructions t = + let (x,y) = T.breakOn "<?" t + in if T.null y + then x + else + let (w,z) = T.breakOn "?>" y + in (if T.takeWhile (\c -> isLetter c || c == '-') + (T.drop 2 w) `elem` ["asciidoc-br", "dbfo"] + then x <> "<pi-" <> T.drop 2 w <> "/>" + else x <> w <> T.take 2 z) <> + handleInstructions (T.drop 2 z) getFigure :: PandocMonad m => Element -> DB m Blocks getFigure e = do @@ -565,32 +579,14 @@ getFigure e = do modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty } return res --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) - -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- convenience function named :: Text -> Element -> Bool -named s e = qName (elName e) == T.unpack s +named s e = qName (elName e) == s -- @@ -605,16 +601,24 @@ addMetadataFromElement e = do Nothing -> return () Just z -> addMetaField "author" z addMetaField "subtitle" e - addMetaField "author" e + addAuthor e addMetaField "date" e addMetaField "release" e addMetaField "releaseinfo" e return mempty - where addMetaField fieldname elt = - case filterChildren (named fieldname) elt of - [] -> return () - [z] -> getInlines z >>= addMeta fieldname - zs -> mapM getInlines zs >>= addMeta fieldname + where + addAuthor elt = + case filterChildren (named "author") elt of + [] -> return () + [z] -> fromAuthor z >>= addMeta "author" + zs -> mapM fromAuthor zs >>= addMeta "author" + fromAuthor elt = + mconcat . intersperse space <$> mapM getInlines (elChildren elt) + addMetaField fieldname elt = + case filterChildren (named fieldname) elt of + [] -> return () + [z] -> getInlines z >>= addMeta fieldname + zs -> mapM getInlines zs >>= addMeta fieldname addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> DB m () addMeta field val = modify (setMeta field val) @@ -627,7 +631,7 @@ isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blockTags isBlockElement _ = False -blockTags :: [String] +blockTags :: [Text] blockTags = [ "abstract" , "ackno" @@ -669,6 +673,7 @@ blockTags = , "index" , "info" , "informalexample" + , "informalfigure" , "informaltable" , "itemizedlist" , "linegroup" @@ -713,8 +718,8 @@ blockTags = , "variablelist" ] ++ admonitionTags -admonitionTags :: [String] -admonitionTags = ["important","caution","note","tip","warning"] +admonitionTags :: [Text] +admonitionTags = ["caution","danger","important","note","tip","warning"] -- Trim leading and trailing newline characters trimNl :: Text -> Text @@ -736,9 +741,9 @@ getMediaobject e = do figTitle <- gets dbFigureTitle ident <- gets dbFigureId (imageUrl, attr) <- - case filterChild (named "imageobject") e of - Nothing -> return (mempty, nullAttr) - Just z -> case filterChild (named "imagedata") z of + case filterElements (named "imageobject") e of + [] -> return (mempty, nullAttr) + (z:_) -> case filterChild (named "imagedata") z of Nothing -> return (mempty, nullAttr) Just i -> let atVal a = attrValue a i w = case atVal "width" of @@ -771,10 +776,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> DB m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "toc" -> skip -- skip TOC, since in pandoc it's autogenerated @@ -829,7 +834,7 @@ parseBlock (Elem e) = "refsect2" -> sect 2 "refsect3" -> sect 3 "refsection" -> gets dbSectionLevel >>= sect . (+1) - l | l `elem` admonitionTags -> parseAdmonition $ T.pack l + l | l `elem` admonitionTags -> parseAdmonition l "area" -> skip "areaset" -> skip "areaspec" -> skip @@ -855,6 +860,7 @@ parseBlock (Elem e) = "variablelist" -> definitionList <$> deflistitems "procedure" -> bulletList <$> steps "figure" -> getFigure e + "informalfigure" -> getFigure e "mediaobject" -> para <$> getMediaobject e "caption" -> skip "info" -> addMetadataFromElement e @@ -890,7 +896,11 @@ parseBlock (Elem e) = "subtitle" -> return mempty -- handled in parent element _ -> skip >> getBlocks e where skip = do - lift $ report $ IgnoredElement $ T.pack $ qName (elName e) + let qn = qName $ elName e + let name = if "pi-" `T.isPrefixOf` qn + then "<?" <> qn <> "?>" + else qn + lift $ report $ IgnoredElement name return mempty codeBlockWithLang = do @@ -898,7 +908,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ T.pack $ strContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -952,17 +962,16 @@ parseBlock (Elem e) = w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> (x >= '0' && x <= '9') - || x == '.') (T.pack w) + || x == '.') w if n > 0 then Just n else Nothing - let numrows = case bodyrows of - [] -> 0 - xs -> maximum $ map length xs + let numrows = maybe 0 maximum $ nonEmpty + $ map length bodyrows let aligns = case colspecs of [] -> replicate numrows AlignDefault cs -> map toAlignment cs let parseWidth s = safeRead (T.filter (\x -> (x >= '0' && x <= '9') || x == '.') s) - let textWidth = case filterChild (named "?dbfo") e of + let textWidth = case filterChild (named "pi-dbfo") e of Just d -> case attrValue "table-width" d of "" -> 1.0 w -> fromMaybe 100.0 (parseWidth w) / 100.0 @@ -1035,12 +1044,12 @@ parseMixed container conts = do x <- parseMixed container rs return $ p <> b <> x -parseRow :: PandocMonad m => [String] -> Element -> DB m [Cell] +parseRow :: PandocMonad m => [Text] -> Element -> DB m [Cell] parseRow cn = do let isEntry x = named "entry" x || named "td" x || named "th" x mapM (parseEntry cn) . filterChildren isEntry -parseEntry :: PandocMonad m => [String] -> Element -> DB m Cell +parseEntry :: PandocMonad m => [Text] -> Element -> DB m Cell parseEntry cn el = do let colDistance sa ea = do let iStrt = elemIndex sa cn @@ -1062,7 +1071,7 @@ getInlines :: PandocMonad m => Element -> DB m Inlines getInlines e' = trimInlines . mconcat <$> mapM parseInline (elContent e') -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -1071,16 +1080,16 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> DB m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s +parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = - return $ text $ maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref + return $ text $ maybe (T.toUpper ref) T.pack $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "anchor" -> do return $ spanWith (attrValue "id" e, [], []) mempty "phrase" -> do let ident = attrValue "id" e - let classes = T.words $ attrValue "class" e + let classes = T.words $ attrValue "role" e if ident /= "" || classes /= [] then innerInlines (spanWith (ident,classes,[])) else innerInlines id @@ -1103,6 +1112,10 @@ parseInline (Elem e) = "segmentedlist" -> segmentedList "classname" -> codeWithLang "code" -> codeWithLang + "citerefentry" -> do + let title = maybe mempty strContent $ filterChild (named "refentrytitle") e + let manvolnum = maybe mempty (\el -> "(" <> strContent el <> ")") $ filterChild (named "manvolnum") e + return $ codeWith ("",["citerefentry"],[]) (title <> manvolnum) "filename" -> codeWithLang "envar" -> codeWithLang "literal" -> codeWithLang @@ -1125,7 +1138,7 @@ parseInline (Elem e) = "userinput" -> codeWithLang "systemitem" -> codeWithLang "varargs" -> return $ code "(...)" - "keycap" -> return (str $ T.pack $ strContent e) + "keycap" -> return (str $ strContent e) "keycombo" -> keycombo <$> mapM parseInline (elContent e) "menuchoice" -> menuchoice <$> @@ -1137,17 +1150,17 @@ parseInline (Elem e) = let title = case attrValue "endterm" e of "" -> maybe "???" xrefTitleByElem (findElementById linkend content) - endterm -> maybe "???" (T.pack . strContent) + endterm -> maybe "???" strContent (findElementById endterm content) return $ link ("#" <> linkend) "" (text title) - "email" -> return $ link ("mailto:" <> T.pack (strContent e)) "" - $ str $ T.pack $ strContent e - "uri" -> return $ link (T.pack $ strContent e) "" $ str $ T.pack $ strContent e + "email" -> return $ link ("mailto:" <> strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e "ulink" -> innerInlines (link (attrValue "url" e) "") "link" -> do ils <- innerInlines id let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "linkend" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, T.words $ attrValue "role" e, []) @@ -1163,12 +1176,15 @@ parseInline (Elem e) = "title" -> return mempty "affiliation" -> skip -- Note: this isn't a real docbook tag; it's what we convert - -- <?asciidor-br?> to in handleInstructions, above. A kludge to - -- work around xml-light's inability to parse an instruction. - "br" -> return linebreak + -- <?asciidor-br?> to in handleInstructions, above. + "pi-asciidoc-br" -> return linebreak _ -> skip >> innerInlines id where skip = do - lift $ report $ IgnoredElement $ T.pack $ qName (elName e) + let qn = qName $ elName e + let name = if "pi-" `T.isPrefixOf` qn + then "<?" <> qn <> "?>" + else qn + lift $ report $ IgnoredElement name return mempty innerInlines f = extractSpaces f . mconcat <$> @@ -1177,7 +1193,7 @@ parseInline (Elem e) = let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ T.pack $ strContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e simpleList = mconcat . intersperse (str "," <> space) <$> mapM getInlines (filterChildren (named "member") e) segmentedList = do @@ -1218,10 +1234,10 @@ parseInline (Elem e) = "sect5" -> descendantContent "title" el "cmdsynopsis" -> descendantContent "command" el "funcsynopsis" -> descendantContent "function" el - _ -> T.pack $ qName (elName el) ++ "_title" + _ -> qName (elName el) <> "_title" where xrefLabel = attrValue "xreflabel" el - descendantContent name = maybe "???" (T.pack . strContent) + descendantContent name = maybe "???" strContent . filterElementName (\n -> qName n == name) -- | Extract a math equation from an element @@ -1241,8 +1257,9 @@ equation e constructor = where mathMLEquations :: [Text] mathMLEquations = map writeTeX $ rights $ readMath - (\x -> qName (elName x) == "math" && qPrefix (elName x) == Just "mml") - (readMathML . T.pack . showElement) + (\x -> qName (elName x) == "math" && + qURI (elName x) == Just "http://www.w3.org/1998/Math/MathML") + (readMathML . showElement) latexEquations :: [Text] latexEquations = readMath (\x -> qName (elName x) == "mathphrase") @@ -1256,8 +1273,8 @@ equation e constructor = -- | Get the actual text stored in a CData block. 'showContent' -- returns the text still surrounded by the [[CDATA]] tags. showVerbatimCData :: Content -> Text -showVerbatimCData (Text (CData _ d _)) = T.pack d -showVerbatimCData c = T.pack $ showContent c +showVerbatimCData (Text (CData _ d _)) = d +showVerbatimCData c = showContent c -- | Set the prefix of a name to 'Nothing' diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 00de6a0cd..c06adf7e3 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx import Codec.Archive.Zip import Control.Monad.Reader import Control.Monad.State.Strict +import Data.Bifunctor (bimap, first) import qualified Data.ByteString.Lazy as B import Data.Default (Default) -import Data.List (delete, intersect) +import Data.List (delete, intersect, foldl') import Data.Char (isSpace) import qualified Data.Map as M import qualified Data.Text as T -import Data.Maybe (isJust, fromMaybe) +import Data.Maybe (catMaybes, isJust, fromMaybe) import Data.Sequence (ViewL (..), viewl) import qualified Data.Sequence as Seq import qualified Data.Set as Set @@ -85,6 +86,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Error import Text.Pandoc.Logging +import Data.List.NonEmpty (nonEmpty) readDocx :: PandocMonad m => ReaderOptions @@ -112,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text -- restarting , docxListState :: M.Map (T.Text, T.Text) Integer , docxPrevPara :: Inlines + , docxTableCaptions :: [Blocks] } instance Default DState where @@ -122,6 +125,7 @@ instance Default DState where , docxDropCap = mempty , docxListState = M.empty , docxPrevPara = mempty + , docxTableCaptions = [] } data DEnv = DEnv { docxOptions :: ReaderOptions @@ -490,15 +494,32 @@ singleParaToPlain blks singleton $ Plain ils singleParaToPlain blks = blks -cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks -cellToBlocks (Docx.Cell bps) = do +cellToCell :: PandocMonad m => RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell +cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do blks <- smushBlocks <$> mapM bodyPartToBlocks bps - return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks') + +rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row] +rowsToRows rows = do + let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows) + cells <- traverse (traverse (uncurry cellToCell)) rowspans + return (fmap (Pandoc.Row nullAttr) cells) + +splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row]) +splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst + $ if hasFirstRowFormatting + then foldl' f ((take 1 rs, []), True) (drop 1 rs) + else foldl' f (([], []), False) rs + where + f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs) + | h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs) + = ((r : headerRows, bodyRows), True) + | otherwise + = ((headerRows, r : bodyRows), False) + + isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue -rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks] -rowToBlocksList (Docx.Row cells) = do - blksList <- mapM cellToBlocks cells - return $ map singleParaToPlain blksList -- like trimInlines, but also take out linebreaks trimSps :: Inlines -> Inlines @@ -545,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName where go c | isSpace c = '-' | otherwise = c +bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks) +bodyPartToTableCaption (TblCaption pPr parparts) = + Just <$> bodyPartToBlocks (Paragraph pPr parparts) +bodyPartToTableCaption _ = pure Nothing + bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) | Just True <- pBidi pPr = do @@ -636,54 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) = let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts +bodyPartToBlocks (TblCaption _ _) = + return $ para mempty -- collected separately bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do - let cap' = simpleCaption $ plain $ text cap - (hdr, rows) = case firstRowFormatting look of - True | null rs -> (Nothing, [r]) - | otherwise -> (Just r, rs) - False -> (Nothing, r:rs) - - cells <- mapM rowToBlocksList rows +bodyPartToBlocks (Tbl cap grid look parts) = do + captions <- gets docxTableCaptions + fullCaption <- case captions of + c : cs -> do + modify (\s -> s { docxTableCaptions = cs }) + return c + [] -> return $ if T.null cap then mempty else plain (text cap) + let shortCaption = if T.null cap then Nothing else Just (toList (text cap)) + cap' = caption shortCaption fullCaption + (hdr, rows) = splitHeaderRows (firstRowFormatting look) parts let width = maybe 0 maximum $ nonEmpty $ map rowLength parts - -- Data.List.NonEmpty is not available with ghc 7.10 so we roll out - -- our own, see - -- https://github.com/jgm/pandoc/pull/4361#issuecomment-365416155 - nonEmpty [] = Nothing - nonEmpty l = Just l rowLength :: Docx.Row -> Int - rowLength (Docx.Row c) = length c + rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c) - let toRow = Pandoc.Row nullAttr . map simpleCell - toHeaderRow l = [toRow l | not (null l)] + headerCells <- rowsToRows hdr + bodyCells <- rowsToRows rows - -- pad cells. New Text.Pandoc.Builder will do that for us, - -- so this is for compatibility while we switch over. - let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells - - hdrCells <- case hdr of - Just r' -> toHeaderRow <$> rowToBlocksList r' - Nothing -> return [] - - -- The two following variables (horizontal column alignment and - -- relative column widths) go to the default at the - -- moment. Width information is in the TblGrid field of the Tbl, - -- so should be possible. Alignment might be more difficult, - -- since there doesn't seem to be a column entity in docx. + -- Horizontal column alignment goes to the default at the moment. Getting + -- it might be difficult, since there doesn't seem to be a column entity + -- in docx. let alignments = replicate width AlignDefault - widths = replicate width ColWidthDefault + totalWidth = sum grid + widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid return $ table cap' (zip alignments widths) - (TableHead nullAttr hdrCells) - [TableBody nullAttr 0 [] cells'] + (TableHead nullAttr headerCells) + [TableBody nullAttr 0 [] bodyCells] (TableFoot nullAttr []) bodyPartToBlocks (OMathPara e) = return $ para $ displayMath (writeTeX e) - -- replace targets with generated anchors. rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do @@ -719,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps + captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps + modify (\s -> s { docxTableCaptions = captions }) blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks blks'' <- removeOrphanAnchors blks' diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs index 46112af19..6e4faa639 100644 --- a/src/Text/Pandoc/Readers/Docx/Combine.hs +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -2,7 +2,7 @@ {- | Module : Text.Pandoc.Readers.Docx.Combine Copyright : © 2014-2020 Jesse Rosenthal <jrosenthal@jhu.edu>, - 2014-2020 John MacFarlane <jgm@berkeley.edu>, + 2014-2021 John MacFarlane <jgm@berkeley.edu>, 2020 Nikolay Yakimov <root@livid.pp.ru> License : GNU GPL, version 2 or above @@ -61,7 +61,7 @@ import Data.List import Data.Bifunctor import Data.Sequence ( ViewL (..), ViewR (..), viewl, viewr, spanr, spanl , (><), (|>) ) -import Text.Pandoc.Builder +import Text.Pandoc.Builder as B data Modifier a = Modifier (a -> a) | AttrModifier (Attr -> a -> a) Attr @@ -116,12 +116,12 @@ ilModifierAndInnards ils = case viewl $ unMany ils of inlinesL :: Inlines -> (Inlines, Inlines) inlinesL ils = case viewl $ unMany ils of - (s :< sq) -> (singleton s, Many sq) + (s :< sq) -> (B.singleton s, Many sq) _ -> (mempty, ils) inlinesR :: Inlines -> (Inlines, Inlines) inlinesR ils = case viewr $ unMany ils of - (sq :> s) -> (Many sq, singleton s) + (sq :> s) -> (Many sq, B.singleton s) _ -> (ils, mempty) combineInlines :: Inlines -> Inlines -> Inlines @@ -182,7 +182,7 @@ isAttrModifier _ = False smushInlines :: [Inlines] -> Inlines smushInlines xs = combineInlines xs' mempty - where xs' = foldl combineInlines mempty xs + where xs' = foldl' combineInlines mempty xs smushBlocks :: [Blocks] -> Blocks -smushBlocks xs = foldl combineBlocks mempty xs +smushBlocks xs = foldl' combineBlocks mempty xs diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index fdcffcc3f..dbb16a821 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ParStyle , CharStyle(cStyleData) , Row(..) + , TblHeader(..) , Cell(..) + , VMerge(..) , TrackedChange(..) , ChangeType(..) , ChangeInfo(..) @@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , pHeading , constructBogusParStyleData , leftBiasedMergeRunStyle + , rowsToRowspans ) where import Text.Pandoc.Readers.Docx.Parse.Styles import Codec.Archive.Zip @@ -63,6 +66,7 @@ import Data.Char (chr, ord, readLitChar) import Data.List import qualified Data.Map as M import qualified Data.Text as T +import Data.Text (Text) import Data.Maybe import System.FilePath import Text.Pandoc.Readers.Docx.Util @@ -72,8 +76,7 @@ import qualified Text.Pandoc.UTF8 as UTF8 import Text.TeXMath (Exp) import Text.TeXMath.Readers.OMML (readOMML) import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont) -import Text.XML.Light -import qualified Text.XML.Light.Cursor as XMLC +import Text.Pandoc.XML.Light data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -127,37 +130,23 @@ mapD f xs = in concatMapM handler xs -unwrap :: NameSpaces -> Content -> [Content] -unwrap ns (Elem element) +unwrapElement :: NameSpaces -> Element -> [Element] +unwrapElement ns element | isElem ns "w" "sdt" element , Just sdtContent <- findChildByName ns "w" "sdtContent" element - = concatMap (unwrap ns . Elem) (elChildren sdtContent) + = concatMap (unwrapElement ns) (elChildren sdtContent) | isElem ns "w" "smartTag" element - = concatMap (unwrap ns . Elem) (elChildren element) -unwrap _ content = [content] + = concatMap (unwrapElement ns) (elChildren element) + | otherwise + = [element{ elContent = concatMap (unwrapContent ns) (elContent element) }] -unwrapChild :: NameSpaces -> Content -> Content -unwrapChild ns (Elem element) = - Elem $ element { elContent = concatMap (unwrap ns) (elContent element) } -unwrapChild _ content = content +unwrapContent :: NameSpaces -> Content -> [Content] +unwrapContent ns (Elem element) = map Elem $ unwrapElement ns element +unwrapContent _ content = [content] -walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor -walkDocument' ns cur = - let modifiedCur = XMLC.modifyContent (unwrapChild ns) cur - in - case XMLC.nextDF modifiedCur of - Just cur' -> walkDocument' ns cur' - Nothing -> XMLC.root modifiedCur - -walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument :: NameSpaces -> Element -> Element walkDocument ns element = - let cur = XMLC.fromContent (Elem element) - cur' = walkDocument' ns cur - in - case XMLC.toTree cur' of - Elem element' -> Just element' - _ -> Nothing - + element{ elContent = concatMap (unwrapContent ns) (elContent element) } newtype Docx = Docx Document deriving Show @@ -239,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] data BodyPart = Paragraph ParagraphStyle [ParPart] | ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart] | Tbl T.Text TblGrid TblLook [Row] + | TblCaption ParagraphStyle [ParPart] | OMathPara [Exp] deriving Show @@ -250,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool} defaultTblLook :: TblLook defaultTblLook = TblLook{firstRowFormatting = False} -newtype Row = Row [Cell] - deriving Show +data Row = Row TblHeader [Cell] deriving Show -newtype Cell = Cell [BodyPart] +data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq) + +data Cell = Cell GridSpan VMerge [BodyPart] deriving Show +type GridSpan = Integer + +data VMerge = Continue + -- ^ This cell should be merged with the one above it + | Restart + -- ^ This cell should not be merged with the one above it + deriving (Show, Eq) + +rowsToRowspans :: [Row] -> [[(Int, Cell)]] +rowsToRowspans rows = let + removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart)) + in removeMergedCells (foldr f [] rows) + where + f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]] + f (Row _ cells) acc = let + spans = g cells Nothing (listToMaybe acc) + in spans : acc + + g :: + -- | The current row + [Cell] -> + -- | Number of columns left below + Maybe Integer -> + -- | (rowspan so far, cell) for the row below this one + Maybe [(Int, Cell)] -> + -- | (rowspan so far, cell) for this row + [(Int, Cell)] + g cells _ Nothing = zip (repeat 1) cells + g cells columnsLeftBelow (Just rowBelow) = + case cells of + [] -> [] + thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of + [] -> zip (repeat 1) cells + (spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ -> + let spanSoFar = case vmerge of + Restart -> 1 + Continue -> 1 + spanSoFarBelow + columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow) + (newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow + in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow) + + dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)]) + dropColumns n [] = (n, []) + dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) = + if n < gridSpan + then (gridSpan - n, cells) + else dropColumns (n - gridSpan) otherCells + leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle leftBiasedMergeRunStyle a b = RunStyle { isBold = isBold a <|> isBold b @@ -343,10 +382,16 @@ archiveToDocxWithWarnings archive = do Right doc -> Right (Docx doc, stateWarnings st) Left e -> Left e +parseXMLFromEntry :: Entry -> Maybe Element +parseXMLFromEntry entry = + case parseXMLElement (UTF8.toTextLazy (fromEntry entry)) of + Left _ -> Nothing + Right el -> Just el + getDocumentXmlPath :: Archive -> Maybe FilePath getDocumentXmlPath zf = do entry <- findEntryByPath "_rels/.rels" zf - relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + relsElem <- parseXMLFromEntry entry let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem rel <- find (\e -> findAttr (QName "Type" Nothing Nothing) e == Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument") @@ -354,18 +399,18 @@ getDocumentXmlPath zf = do fp <- findAttr (QName "Target" Nothing Nothing) rel -- sometimes there will be a leading slash, which windows seems to -- have trouble with. - return $ case fp of + return $ case T.unpack fp of '/' : fp' -> fp' - _ -> fp + fp' -> fp' archiveToDocument :: Archive -> D Document archiveToDocument zf = do docPath <- asks envDocXmlPath entry <- maybeToD $ findEntryByPath docPath zf - docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + docElem <- maybeToD $ parseXMLFromEntry entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem - let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) + let bodyElem' = walkDocument namespaces bodyElem body <- elemToBody namespaces bodyElem' return $ Document namespaces body @@ -401,23 +446,24 @@ constructBogusParStyleData stName = ParStyle archiveToNotes :: Archive -> Notes archiveToNotes zf = let fnElem = findEntryByPath "word/footnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + >>= parseXMLFromEntry enElem = findEntryByPath "word/endnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - fn_namespaces = maybe [] elemToNameSpaces fnElem - en_namespaces = maybe [] elemToNameSpaces enElem - ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= walkDocument ns >>= elemToNotes ns "footnote" - en = enElem >>= walkDocument ns >>= elemToNotes ns "endnote" + >>= parseXMLFromEntry + fn_namespaces = maybe mempty elemToNameSpaces fnElem + en_namespaces = maybe mempty elemToNameSpaces enElem + ns = M.union fn_namespaces en_namespaces + fn = fnElem >>= elemToNotes ns "footnote" . walkDocument ns + en = enElem >>= elemToNotes ns "endnote" . walkDocument ns in Notes ns fn en archiveToComments :: Archive -> Comments archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - cmts_namespaces = maybe [] elemToNameSpaces cmtsElem - cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces) + >>= parseXMLFromEntry + cmts_namespaces = maybe mempty elemToNameSpaces cmtsElem + cmts = elemToComments cmts_namespaces . walkDocument cmts_namespaces <$> + cmtsElem in case cmts of Just c -> Comments cmts_namespaces c @@ -433,20 +479,26 @@ filePathToRelType path docXmlPath = then Just InDocument else Nothing -relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship -relElemToRelationship relType element | qName (elName element) == "Relationship" = +relElemToRelationship :: FilePath -> DocumentLocation -> Element + -> Maybe Relationship +relElemToRelationship fp relType element | qName (elName element) == "Relationship" = do - relId <- findAttrText (QName "Id" Nothing Nothing) element - target <- findAttrText (QName "Target" Nothing Nothing) element - return $ Relationship relType relId target -relElemToRelationship _ _ = Nothing + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + -- target may be relative (media/image1.jpeg) or absolute + -- (/word/media/image1.jpeg); we need to relativize it (see #7374) + let frontOfFp = T.pack $ takeWhile (/= '_') fp + let target' = fromMaybe target $ + T.stripPrefix frontOfFp $ T.dropWhile (== '/') target + return $ Relationship relType relId target' +relElemToRelationship _ _ _ = Nothing filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship] filePathToRelationships ar docXmlPath fp | Just relType <- filePathToRelType fp docXmlPath , Just entry <- findEntryByPath fp ar - , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry = - mapMaybe (relElemToRelationship relType) $ elChildren relElems + , Just relElems <- parseXMLFromEntry entry = + mapMaybe (relElemToRelationship fp relType) $ elChildren relElems filePathToRelationships _ _ _ = [] archiveToRelationships :: Archive -> FilePath -> [Relationship] @@ -478,10 +530,10 @@ lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do loElemToLevelOverride :: NameSpaces -> Element -> Maybe LevelOverride loElemToLevelOverride ns element | isElem ns "w" "lvlOverride" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element let startOverride = findChildByName ns "w" "startOverride" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger lvl = findChildByName ns "w" "lvl" element >>= levelElemToLevel ns return $ LevelOverride ilvl startOverride lvl @@ -490,9 +542,9 @@ loElemToLevelOverride _ _ = Nothing numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | isElem ns "w" "num" element = do - numId <- findAttrTextByName ns "w" "numId" element + numId <- findAttrByName ns "w" "numId" element absNumId <- findChildByName ns "w" "abstractNumId" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let lvlOverrides = mapMaybe (loElemToLevelOverride ns) (findChildrenByName ns "w" "lvlOverride" element) @@ -502,7 +554,7 @@ numElemToNum _ _ = Nothing absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb absNumElemToAbsNum ns element | isElem ns "w" "abstractNum" element = do - absNumId <- findAttrTextByName ns "w" "abstractNumId" element + absNumId <- findAttrByName ns "w" "abstractNumId" element let levelElems = findChildrenByName ns "w" "lvl" element levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels @@ -511,23 +563,23 @@ absNumElemToAbsNum _ _ = Nothing levelElemToLevel :: NameSpaces -> Element -> Maybe Level levelElemToLevel ns element | isElem ns "w" "lvl" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element fmt <- findChildByName ns "w" "numFmt" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" txt <- findChildByName ns "w" "lvlText" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let start = findChildByName ns "w" "start" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger return (Level ilvl fmt txt start) levelElemToLevel _ _ = Nothing archiveToNumbering' :: Archive -> Maybe Numbering archiveToNumbering' zf = case findEntryByPath "word/numbering.xml" zf of - Nothing -> Just $ Numbering [] [] [] + Nothing -> Just $ Numbering mempty [] [] Just entry -> do - numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + numberingElem <- parseXMLFromEntry entry let namespaces = elemToNameSpaces numberingElem numElems = findChildrenByName namespaces "w" "num" numberingElem absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem @@ -537,13 +589,13 @@ archiveToNumbering' zf = archiveToNumbering :: Archive -> Numbering archiveToNumbering archive = - fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) + fromMaybe (Numbering mempty [] []) (archiveToNumbering' archive) -elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map T.Text Element) +elemToNotes :: NameSpaces -> Text -> Element -> Maybe (M.Map T.Text Element) elemToNotes ns notetype element | isElem ns "w" (notetype <> "s") element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" notetype element) in @@ -555,7 +607,7 @@ elemToComments :: NameSpaces -> Element -> M.Map T.Text Element elemToComments ns element | isElem ns "w" "comments" element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" "comment" element) in @@ -570,7 +622,7 @@ elemToTblGrid :: NameSpaces -> Element -> D TblGrid elemToTblGrid ns element | isElem ns "w" "tblGrid" element = let cols = findChildrenByName ns "w" "gridCol" element in - mapD (\e -> maybeToD (findAttrByName ns "w" "val" e >>= stringToInteger)) + mapD (\e -> maybeToD (findAttrByName ns "w" "w" e >>= stringToInteger)) cols elemToTblGrid _ _ = throwError WrongElem @@ -594,14 +646,31 @@ elemToRow ns element | isElem ns "w" "tr" element = do let cellElems = findChildrenByName ns "w" "tc" element cells <- mapD (elemToCell ns) cellElems - return $ Row cells + let hasTblHeader = maybe NoTblHeader (const HasTblHeader) + (findChildByName ns "w" "trPr" element + >>= findChildByName ns "w" "tblHeader") + return $ Row hasTblHeader cells elemToRow _ _ = throwError WrongElem elemToCell :: NameSpaces -> Element -> D Cell elemToCell ns element | isElem ns "w" "tc" element = do + let properties = findChildByName ns "w" "tcPr" element + let gridSpan = properties + >>= findChildByName ns "w" "gridSpan" + >>= findAttrByName ns "w" "val" + >>= stringToInteger + let vMerge = case properties >>= findChildByName ns "w" "vMerge" of + Nothing -> Restart + Just e -> + fromMaybe Continue $ do + s <- findAttrByName ns "w" "val" e + case s of + "continue" -> Just Continue + "restart" -> Just Restart + _ -> Nothing cellContents <- mapD (elemToBodyPart ns) (elChildren element) - return $ Cell cellContents + return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation @@ -615,12 +684,12 @@ elemToParIndentation ns element | isElem ns "w" "ind" element = stringToInteger , hangingParIndent = findAttrByName ns "w" "hanging" element >>= - stringToInteger} + stringToInteger } elemToParIndentation _ _ = Nothing -testBitMask :: String -> Int -> Bool +testBitMask :: Text -> Int -> Bool testBitMask bitMaskS n = - case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + case (reads ("0x" ++ T.unpack bitMaskS) :: [(Int, String)]) of [] -> False ((n', _) : _) -> (n' .|. n) /= 0 @@ -633,10 +702,9 @@ pNumInfo = getParStyleField numInfo . pStyle elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element - , (c:_) <- findChildrenByName ns "m" "oMathPara" element = - do - expsLst <- eitherToD $ readOMML $ T.pack $ showElement c - return $ OMathPara expsLst + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element , Just (numId, lvl) <- getNumInfo ns element = do @@ -654,13 +722,31 @@ elemToBodyPart ns element Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do levelInfo <- lookupLevel numId lvl <$> asks envNumbering return $ ListItem parstyle numId lvl levelInfo parparts - _ -> return $ Paragraph parstyle parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts + elemToBodyPart ns element | isElem ns "w" "tbl" element = do - let caption' = findChildByName ns "w" "tblPr" element + let tblProperties = findChildByName ns "w" "tblPr" element + caption = fromMaybe "" $ tblProperties >>= findChildByName ns "w" "tblCaption" - >>= findAttrTextByName ns "w" "val" - caption = fromMaybe "" caption' + >>= findAttrByName ns "w" "val" + description = fromMaybe "" $ tblProperties + >>= findChildByName ns "w" "tblDescription" + >>= findAttrByName ns "w" "val" grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g Nothing -> return [] @@ -673,7 +759,7 @@ elemToBodyPart ns element grid <- grid' tblLook <- tblLook' rows <- mapD (elemToRow ns) (elChildren element) - return $ Tbl caption grid tblLook rows + return $ Tbl (caption <> description) grid tblLook rows elemToBodyPart _ _ = throwError WrongElem lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target @@ -698,8 +784,8 @@ getTitleAndAlt :: NameSpaces -> Element -> (T.Text, T.Text) getTitleAndAlt ns element = let mbDocPr = findChildByName ns "wp" "inline" element >>= findChildByName ns "wp" "docPr" - title = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "title") - alt = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "descr") + title = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "title") + alt = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "descr") in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart @@ -711,22 +797,29 @@ elemToParPart ns element = let (title, alt) = getTitleAndAlt ns drawingElem a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrTextByName ns "r" "embed" + >>= findAttrByName ns "r" "embed" in case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) Nothing -> throwError WrongElem --- The below is an attempt to deal with images in deprecated vml format. +-- The two cases below are an attempt to deal with images in deprecated vml format. +-- Todo: check out title and attr for deprecated format. elemToParPart ns element | isElem ns "w" "r" element , Just _ <- findChildByName ns "w" "pict" element = let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrTextByName ns "r" "id" + >>= findAttrByName ns "r" "id" in case drawing of - -- Todo: check out title and attr for deprecated format. Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) Nothing -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element + , Just objectElem <- findChildByName ns "w" "object" element + , Just shapeElem <- findChildByName ns "v" "shape" objectElem + , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem + , Just drawingId <- findAttrByName ns "r" "id" imagedataElem + = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) -- Chart elemToParPart ns element | isElem ns "w" "r" element @@ -790,7 +883,7 @@ elemToParPart ns element fldCharState <- gets stateFldCharState case fldCharState of FldCharOpen -> do - info <- eitherToD $ parseFieldInfo $ T.pack $ strContent instrText + info <- eitherToD $ parseFieldInfo $ strContent instrText modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} return NullParPart _ -> return NullParPart @@ -811,48 +904,48 @@ elemToParPart ns element return $ ChangedRuns change runs elemToParPart ns element | isElem ns "w" "bookmarkStart" element - , Just bmId <- findAttrTextByName ns "w" "id" element - , Just bmName <- findAttrTextByName ns "w" "name" element = + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttrTextByName ns "r" "id" element = do + , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> - case findAttrTextByName ns "w" "anchor" element of + case findAttrByName ns "w" "anchor" element of Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs Nothing -> return $ ExternalHyperLink target runs Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttrTextByName ns "w" "anchor" element = do + , Just anchor <- findAttrByName ns "w" "anchor" element = do runs <- mapD (elemToRun ns) (elChildren element) return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "w" "commentRangeStart" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = do + , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "commentRangeEnd" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = + , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId elemToParPart ns element | isElem ns "m" "oMath" element = - fmap PlainOMath (eitherToD $ readOMML $ T.pack $ showElement element) + fmap PlainOMath (eitherToD $ readOMML $ showElement element) elemToParPart _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element | isElem ns "w" "comment" element - , Just cmtId <- findAttrTextByName ns "w" "id" element - , Just cmtAuthor <- findAttrTextByName ns "w" "author" element - , cmtDate <- findAttrTextByName ns "w" "date" element = do + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , cmtDate <- findAttrByName ns "w" "date" element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps elemToCommentStart _ _ = throwError WrongElem @@ -871,7 +964,7 @@ elemToExtent drawingElem = where wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem - >>= findAttr (QName at Nothing Nothing) >>= safeRead . T.pack + >>= findAttr (QName at Nothing Nothing) >>= safeRead childElemToRun :: NameSpaces -> Element -> D Run @@ -882,7 +975,7 @@ childElemToRun ns element = let (title, alt) = getTitleAndAlt ns element a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrText (QName "embed" (lookup "r" ns) (Just "r")) + >>= findAttr (QName "embed" (M.lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= @@ -895,7 +988,7 @@ childElemToRun ns element = return InlineChart childElemToRun ns element | isElem ns "w" "footnoteReference" element - , Just fnId <- findAttrTextByName ns "w" "id" element = do + , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -903,7 +996,7 @@ childElemToRun ns element Nothing -> return $ Footnote [] childElemToRun ns element | isElem ns "w" "endnoteReference" element - , Just enId <- findAttrTextByName ns "w" "id" element = do + , Just enId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -956,15 +1049,15 @@ getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Insertion (ChangeInfo cId cAuthor mcDate) getTrackedChange ns element | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Deletion (ChangeInfo cId cAuthor mcDate) getTrackedChange _ _ = Nothing @@ -973,7 +1066,7 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (fmap ParaStyleId . findAttrTextByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle {pStyle = mapMaybe (`M.lookup` sty) style @@ -1005,7 +1098,7 @@ elemToRunStyleD ns element charStyles <- asks envCharStyles let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrTextByName ns "w" "val" >>= + findAttrByName ns "w" "val" >>= flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle @@ -1015,7 +1108,7 @@ elemToRunElem ns element | isElem ns "w" "t" element || isElem ns "w" "delText" element || isElem ns "m" "t" element = do - let str = T.pack $ strContent element + let str = strContent element font <- asks envFont case font of Nothing -> return $ TextRun str @@ -1037,14 +1130,14 @@ getSymChar :: NameSpaces -> Element -> RunElem getSymChar ns element | Just s <- lowerFromPrivate <$> getCodepoint , Just font <- getFont = - case readLitChar ("\\x" ++ s) of + case readLitChar ("\\x" ++ T.unpack s) of [(char, _)] -> TextRun . maybe "" T.singleton $ getUnicode font char _ -> TextRun "" where getCodepoint = findAttrByName ns "w" "char" element - getFont = textToFont . T.pack =<< findAttrByName ns "w" "font" element - lowerFromPrivate ('F':xs) = '0':xs - lowerFromPrivate xs = xs + getFont = textToFont =<< findAttrByName ns "w" "font" element + lowerFromPrivate t | "F" `T.isPrefixOf` t = "0" <> T.drop 1 t + | otherwise = t getSymChar _ _ = TextRun "" elemToRunElems :: NameSpaces -> Element -> D [RunElem] @@ -1054,8 +1147,9 @@ elemToRunElems ns element let qualName = elemName ns "w" let font = do fontElem <- findElement (qualName "rFonts") element - textToFont . T.pack =<< - foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] + textToFont =<< + foldr ((<|>) . (flip findAttr fontElem . qualName)) + Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index 236167187..0d7271d6a 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -48,11 +48,13 @@ import Data.Function (on) import Data.String (IsString(..)) import qualified Data.Map as M import qualified Data.Text as T +import qualified Data.Text.Read +import Data.Text (Text) import Data.Maybe import Data.Coerce import Text.Pandoc.Readers.Docx.Util import qualified Text.Pandoc.UTF8 as UTF8 -import Text.XML.Light +import Text.Pandoc.XML.Light newtype CharStyleId = CharStyleId T.Text deriving (Show, Eq, Ord, IsString, FromStyleId) @@ -108,7 +110,7 @@ data RunStyle = RunStyle { isBold :: Maybe Bool , isRTL :: Maybe Bool , isForceCTL :: Maybe Bool , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String + , rUnderline :: Maybe Text , rParentStyle :: Maybe CharStyle } deriving Show @@ -135,19 +137,22 @@ defaultRunStyle = RunStyle { isBold = Nothing , rParentStyle = Nothing } -archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) => - (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) +archiveToStyles' + :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) + => (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) archiveToStyles' conv1 conv2 zf = - let stylesElem = findEntryByPath "word/styles.xml" zf >>= - (parseXMLDoc . UTF8.toStringLazy . fromEntry) - in - case stylesElem of - Nothing -> (M.empty, M.empty) - Just styElem -> - let namespaces = elemToNameSpaces styElem - in - ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing, - M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing) + case findEntryByPath "word/styles.xml" zf of + Nothing -> (M.empty, M.empty) + Just entry -> + case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of + Left _ -> (M.empty, M.empty) + Right styElem -> + let namespaces = elemToNameSpaces styElem + in + ( M.fromList $ map (\r -> (conv1 r, r)) $ + buildBasedOnList namespaces styElem Nothing, + M.fromList $ map (\p -> (conv2 p, p)) $ + buildBasedOnList namespaces styElem Nothing) isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle @@ -155,7 +160,7 @@ isBasedOnStyle ns element parentStyle , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element @@ -165,7 +170,7 @@ isBasedOnStyle ns element parentStyle | otherwise = False class HasStyleId a => ElemToStyle a where - cStyleType :: Maybe a -> String + cStyleType :: Maybe a -> Text elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a class FromStyleId (StyleId a) => HasStyleId a where @@ -222,8 +227,10 @@ buildBasedOnList ns element rootStyle = stys -> stys ++ concatMap (buildBasedOnList ns element . Just) stys -stringToInteger :: String -> Maybe Integer -stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +stringToInteger :: Text -> Maybe Integer +stringToInteger s = case Data.Text.Read.decimal s of + Right (x,_) -> Just x + Left _ -> Nothing checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag @@ -243,7 +250,7 @@ checkOnOff _ _ _ = Nothing elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle elemToCharStyle ns element parentStyle - = CharStyle <$> (CharStyleId <$> findAttrTextByName ns "w" "styleId" element) + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) <*> getElementStyleName ns element <*> Just (elemToRunStyle ns element parentStyle) @@ -277,7 +284,7 @@ elemToRunStyle _ _ _ = defaultRunStyle getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element | Just styleName <- getElementStyleName ns element - , Just n <- stringToInteger . T.unpack =<< + , Just n <- stringToInteger =<< (T.stripPrefix "heading " . T.toLower $ fromStyleName styleName) , n > 0 = Just (styleName, fromInteger n) @@ -285,8 +292,8 @@ getHeaderLevel _ _ = Nothing getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a getElementStyleName ns el = coerce <$> - ((findChildByName ns "w" "name" el >>= findAttrTextByName ns "w" "val") - <|> findAttrTextByName ns "w" "styleId" el) + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (T.Text, T.Text) getNumInfo ns element = do @@ -294,15 +301,15 @@ getNumInfo ns element = do findChildByName ns "w" "numPr" lvl = fromMaybe "0" (numPr >>= findChildByName ns "w" "ilvl" >>= - findAttrTextByName ns "w" "val") + findAttrByName ns "w" "val") numId <- numPr >>= findChildByName ns "w" "numId" >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" return (numId, lvl) elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle elemToParStyleData ns element parentStyle - | Just styleId <- findAttrTextByName ns "w" "styleId" element + | Just styleId <- findAttrByName ns "w" "styleId" element , Just styleName <- getElementStyleName ns element = Just $ ParStyle { diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index a573344ff..970697a2d 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -1,7 +1,8 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.StyleMaps Copyright : © 2014-2020 Jesse Rosenthal <jrosenthal@jhu.edu>, - 2014-2020 John MacFarlane <jgm@berkeley.edu>, + 2014-2021 John MacFarlane <jgm@berkeley.edu>, 2015 Nikolay Yakimov <root@livid.pp.ru> License : GNU GPL, version 2 or above @@ -18,51 +19,52 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName - , findAttrText + , findElementByName , findAttrByName - , findAttrTextByName ) where -import Data.Maybe (mapMaybe) import qualified Data.Text as T -import Text.XML.Light +import Data.Text (Text) +import Text.Pandoc.XML.Light +import qualified Data.Map as M -type NameSpaces = [(String, String)] +type NameSpaces = M.Map Text Text elemToNameSpaces :: Element -> NameSpaces -elemToNameSpaces = mapMaybe attrToNSPair . elAttribs +elemToNameSpaces = foldr (\(Attr qn val) -> + case qn of + QName s _ (Just "xmlns") -> M.insert s val + _ -> id) mempty . elAttribs -attrToNSPair :: Attr -> Maybe (String, String) -attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) -attrToNSPair _ = Nothing - -elemName :: NameSpaces -> String -> String -> QName +elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + QName name (M.lookup prefix ns) + (if T.null prefix then Nothing else Just prefix) -isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = - let ns' = ns ++ elemToNameSpaces element + let ns' = ns <> elemToNameSpaces element in qName (elName element) == name && - qURI (elName element) == lookup prefix ns' + qURI (elName element) == M.lookup prefix ns' -findChildByName :: NameSpaces -> String -> String -> Element -> Maybe Element +findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element findChildByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findChild (elemName ns' pref name) el -findChildrenByName :: NameSpaces -> String -> String -> Element -> [Element] +findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] findChildrenByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findChildren (elemName ns' pref name) el -findAttrText :: QName -> Element -> Maybe T.Text -findAttrText x = fmap T.pack . findAttr x +-- | Like 'findChildrenByName', but searches descendants. +findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element +findElementByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findElement (elemName ns' pref name) el -findAttrByName :: NameSpaces -> String -> String -> Element -> Maybe String +findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findAttr (elemName ns' pref name) el -findAttrTextByName :: NameSpaces -> String -> String -> Element -> Maybe T.Text -findAttrTextByName a b c = fmap T.pack . findAttrByName a b c diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs index 336be09e5..db98ac8de 100644 --- a/src/Text/Pandoc/Readers/DokuWiki.hs +++ b/src/Text/Pandoc/Readers/DokuWiki.hs @@ -29,26 +29,27 @@ import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, nested) -import Text.Pandoc.Shared (crFilter, trim, stringify, tshow) +import Text.Pandoc.Shared (trim, stringify, tshow) -- | Read DokuWiki from an input string and return a Pandoc document. -readDokuWiki :: PandocMonad m +readDokuWiki :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readDokuWiki opts s = do - let input = crFilter s - res <- runParserT parseDokuWiki def {stateOptions = opts } "source" input + let sources = toSources s + res <- runParserT parseDokuWiki def {stateOptions = opts } + (initialSourceName sources) sources case res of - Left e -> throwError $ PandocParsecError input e + Left e -> throwError $ PandocParsecError sources e Right d -> return d -type DWParser = ParserT Text ParserState +type DWParser = ParserT Sources ParserState -- * Utility functions -- | Parse end-of-line, which can be either a newline or end-of-file. -eol :: Stream s m Char => ParserT s st m () +eol :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () eol = void newline <|> eof nested :: PandocMonad m => DWParser m a -> DWParser m a @@ -317,7 +318,7 @@ interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" <> page interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" <> page interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" <> page interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" <> page -interwikiToUrl _ page = "https://www.google.com/search?q=" <> page <> "&btnI=lucky" +interwikiToUrl unknown page = unknown <> ">" <> page linkText :: PandocMonad m => DWParser m B.Inlines linkText = parseLink fromRaw "[[" "]]" diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 5e3326e6d..eb8d2405d 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -17,14 +17,14 @@ module Text.Pandoc.Readers.EPUB (readEPUB) where -import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry, +import Codec.Archive.Zip (Archive (..), Entry(..), findEntryByPath, fromEntry, toArchiveOrFail) import Control.DeepSeq (NFData, deepseq) import Control.Monad (guard, liftM, liftM2, mplus) import Control.Monad.Except (throwError) import qualified Data.ByteString.Lazy as BL (ByteString) -import Data.List (isInfixOf) import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Map as M (Map, elems, fromList, lookup) import Data.Maybe (mapMaybe) import qualified Data.Text.Lazy as TL @@ -40,12 +40,12 @@ import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension) import Text.Pandoc.MIME (MimeType) import Text.Pandoc.Options (ReaderOptions (..)) import Text.Pandoc.Readers.HTML (readHtml) -import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI) -import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI, tshow) +import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy) import Text.Pandoc.Walk (query, walk) -import Text.XML.Light +import Text.Pandoc.XML.Light -type Items = M.Map String (FilePath, MimeType) +type Items = M.Map Text (FilePath, MimeType) readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc readEPUB opts bytes = case toArchiveOrFail bytes of @@ -125,26 +125,27 @@ imageToPandoc s = B.doc . B.para $ B.image (T.pack s) "" mempty imageMimes :: [MimeType] imageMimes = ["image/gif", "image/jpeg", "image/png"] -type CoverId = String +type CoverId = Text type CoverImage = FilePath -parseManifest :: (PandocMonad m) => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) +parseManifest :: (PandocMonad m) + => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) parseManifest content coverId = do manifest <- findElementE (dfName "manifest") content let items = findChildren (dfName "item") manifest r <- mapM parseItem items let cover = findAttr (emptyName "href") =<< filterChild findCover manifest - return (cover `mplus` coverId, M.fromList r) + return (T.unpack <$> (cover `mplus` coverId), M.fromList r) where - findCover e = maybe False (isInfixOf "cover-image") + findCover e = maybe False (T.isInfixOf "cover-image") (findAttr (emptyName "properties") e) || Just True == liftM2 (==) coverId (findAttr (emptyName "id") e) parseItem e = do uid <- findAttrE (emptyName "id") e href <- findAttrE (emptyName "href") e mime <- findAttrE (emptyName "media-type") e - return (uid, (href, T.pack mime)) + return (uid, (T.unpack href, mime)) parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)] parseSpine is e = do @@ -172,25 +173,25 @@ parseMeta content = do -- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem parseMetaItem :: Element -> Meta -> Meta parseMetaItem e@(stripNamespace . elName -> field) meta = - addMetaField (renameMeta field) (B.str $ T.pack $ strContent e) meta + addMetaField (renameMeta field) (B.str $ strContent e) meta -renameMeta :: String -> T.Text +renameMeta :: Text -> Text renameMeta "creator" = "author" -renameMeta s = T.pack s +renameMeta s = s getManifest :: PandocMonad m => Archive -> m (String, Element) getManifest archive = do metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive - docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + docElem <- parseXMLDocE metaEntry let namespaces = mapMaybe attrToNSPair (elAttribs docElem) ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) as <- fmap (map attrToPair . elAttribs) (findElementE (QName "rootfile" (Just ns) Nothing) docElem) - manifestFile <- mkE "Root not found" (lookup "full-path" as) + manifestFile <- T.unpack <$> mkE "Root not found" (lookup "full-path" as) let rootdir = dropFileName manifestFile --mime <- lookup "media-type" as manifest <- findEntryByPathE manifestFile archive - (rootdir,) <$> (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + (rootdir,) <$> parseXMLDocE manifest -- Fixup @@ -200,7 +201,8 @@ fixInternalReferences pathToFile = . walk (fixBlockIRs filename) . walk (fixInlineIRs filename) where - (root, T.unpack . escapeURI . T.pack -> filename) = splitFileName pathToFile + (root, T.unpack . escapeURI . T.pack -> filename) = + splitFileName pathToFile fixInlineIRs :: String -> Inline -> Inline fixInlineIRs s (Span as v) = @@ -213,7 +215,7 @@ fixInlineIRs s (Link as is t) = Link (fixAttrs s as) is t fixInlineIRs _ v = v -prependHash :: [T.Text] -> Inline -> Inline +prependHash :: [Text] -> Inline -> Inline prependHash ps l@(Link attr is (url, tit)) | or [s `T.isPrefixOf` url | s <- ps] = Link attr is ("#" <> url, tit) @@ -230,16 +232,17 @@ fixBlockIRs s (CodeBlock as code) = fixBlockIRs _ b = b fixAttrs :: FilePath -> B.Attr -> B.Attr -fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . T.null) cs, removeEPUBAttrs kvs) +fixAttrs s (ident, cs, kvs) = + (addHash s ident, filter (not . T.null) cs, removeEPUBAttrs kvs) -addHash :: String -> T.Text -> T.Text +addHash :: FilePath -> Text -> Text addHash _ "" = "" addHash s ident = T.pack (takeFileName s) <> "#" <> ident -removeEPUBAttrs :: [(T.Text, T.Text)] -> [(T.Text, T.Text)] +removeEPUBAttrs :: [(Text, Text)] -> [(Text, Text)] removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs -isEPUBAttr :: (T.Text, a) -> Bool +isEPUBAttr :: (Text, a) -> Bool isEPUBAttr (k, _) = "epub:" `T.isPrefixOf` k -- Library @@ -256,39 +259,44 @@ uncurry3 f (a, b, c) = f a b c -- Utility -stripNamespace :: QName -> String +stripNamespace :: QName -> Text stripNamespace (QName v _ _) = v -attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair :: Attr -> Maybe (Text, Text) attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) attrToNSPair _ = Nothing -attrToPair :: Attr -> (String, String) +attrToPair :: Attr -> (Text, Text) attrToPair (Attr (QName name _ _) val) = (name, val) -defaultNameSpace :: Maybe String +defaultNameSpace :: Maybe Text defaultNameSpace = Just "http://www.idpf.org/2007/opf" -dfName :: String -> QName +dfName :: Text -> QName dfName s = QName s defaultNameSpace Nothing -emptyName :: String -> QName +emptyName :: Text -> QName emptyName s = QName s Nothing Nothing -- Convert Maybe interface to Either -findAttrE :: PandocMonad m => QName -> Element -> m String +findAttrE :: PandocMonad m => QName -> Element -> m Text findAttrE q e = mkE "findAttr" $ findAttr q e findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry findEntryByPathE (normalise . unEscapeString -> path) a = - mkE ("No entry on path: " ++ path) $ findEntryByPath path a + mkE ("No entry on path: " <> T.pack path) $ findEntryByPath path a -parseXMLDocE :: PandocMonad m => String -> m Element -parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc +parseXMLDocE :: PandocMonad m => Entry -> m Element +parseXMLDocE entry = + either (throwError . PandocXMLError fp) return $ parseXMLElement doc + where + doc = UTF8.toTextLazy . fromEntry $ entry + fp = T.pack $ eRelativePath entry findElementE :: PandocMonad m => QName -> Element -> m Element -findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x +findElementE e x = + mkE ("Unable to find element: " <> tshow e) $ findElement e x -mkE :: PandocMonad m => String -> Maybe a -> m a -mkE s = maybe (throwError . PandocParseError $ T.pack s) return +mkE :: PandocMonad m => Text -> Maybe a -> m a +mkE s = maybe (throwError . PandocParseError $ s) return diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs index b0d2f092b..84e5278db 100644 --- a/src/Text/Pandoc/Readers/FB2.hs +++ b/src/Text/Pandoc/Readers/FB2.hs @@ -25,13 +25,13 @@ TODO: module Text.Pandoc.Readers.FB2 ( readFB2 ) where import Control.Monad.Except (throwError) import Control.Monad.State.Strict -import Data.ByteString.Lazy.Char8 ( pack ) import Data.ByteString.Base64.Lazy import Data.Functor import Data.List (intersperse) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import Data.Default import Data.Maybe import Text.HTML.TagSoup.Entity (lookupEntity) @@ -40,8 +40,9 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad, insertMedia, report) import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options -import Text.Pandoc.Shared (crFilter) -import Text.XML.Light +import Text.Pandoc.XML.Light +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Sources (ToSources(..), sourcesToText) type FB2 m = StateT FB2State m @@ -62,12 +63,15 @@ instance HasMeta FB2State where setMeta field v s = s {fb2Meta = setMeta field v (fb2Meta s)} deleteMeta field s = s {fb2Meta = deleteMeta field (fb2Meta s)} -readFB2 :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readFB2 :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readFB2 _ inp = - case parseXMLDoc $ crFilter inp of - Nothing -> throwError $ PandocParseError "Not an XML document" - Just e -> do - (bs, st) <- runStateT (parseRootElement e) def + case parseXMLElement $ TL.fromStrict $ sourcesToText $ toSources inp of + Left msg -> throwError $ PandocXMLError "" msg + Right el -> do + (bs, st) <- runStateT (parseRootElement el) def let authors = if null $ fb2Authors st then id else setMeta "author" (map text $ reverse $ fb2Authors st) @@ -83,12 +87,12 @@ removeHash t = case T.uncons t of Just ('#', xs) -> xs _ -> t -convertEntity :: String -> Text -convertEntity e = maybe (T.toUpper $ T.pack e) T.pack $ lookupEntity e +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack $ lookupEntity (T.unpack e) parseInline :: PandocMonad m => Content -> FB2 m Inlines parseInline (Elem e) = - case T.pack $ qName $ elName e of + case qName $ elName e of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -96,12 +100,12 @@ parseInline (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement name pure mempty -parseInline (Text x) = pure $ text $ T.pack $ cdData x +parseInline (Text x) = pure $ text $ cdData x parseInline (CRef r) = pure $ str $ convertEntity r parseSubtitle :: PandocMonad m => Element -> FB2 m Blocks @@ -111,7 +115,7 @@ parseSubtitle e = headerWith ("", ["unnumbered"], []) <$> gets fb2SectionLevel < parseRootElement :: PandocMonad m => Element -> FB2 m Blocks parseRootElement e = - case T.pack $ qName $ elName e of + case qName $ elName e of "FictionBook" -> do -- Parse notes before parsing the rest of the content. case filterChild isNotesBody e of @@ -144,7 +148,7 @@ parseNote e = Just sectionId -> do content <- mconcat <$> mapM parseSectionChild (dropTitle $ elChildren e) oldNotes <- gets fb2Notes - modify $ \s -> s { fb2Notes = M.insert ("#" <> T.pack sectionId) content oldNotes } + modify $ \s -> s { fb2Notes = M.insert ("#" <> sectionId) content oldNotes } pure () where isTitle x = qName (elName x) == "title" @@ -156,7 +160,7 @@ parseNote e = -- | Parse a child of @\<FictionBook>@ element. parseFictionBookChild :: PandocMonad m => Element -> FB2 m Blocks parseFictionBookChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "stylesheet" -> pure mempty -- stylesheet is ignored "description" -> mempty <$ mapM_ parseDescriptionChild (elChildren e) "body" -> if isNotesBody e @@ -168,7 +172,7 @@ parseFictionBookChild e = -- | Parse a child of @\<description>@ element. parseDescriptionChild :: PandocMonad m => Element -> FB2 m () parseDescriptionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title-info" -> mapM_ parseTitleInfoChild (elChildren e) "src-title-info" -> pure () -- ignore "document-info" -> pure () @@ -182,7 +186,7 @@ parseDescriptionChild e = -- | Parse a child of @\<body>@ element. parseBodyChild :: PandocMonad m => Element -> FB2 m Blocks parseBodyChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "image" -> parseImageElement e "title" -> header <$> gets fb2SectionLevel <*> parseTitleType (elContent e) "epigraph" -> parseEpigraph e @@ -196,7 +200,10 @@ parseBinaryElement e = (Nothing, _) -> report $ IgnoredElement "binary without id attribute" (Just _, Nothing) -> report $ IgnoredElement "binary without content-type attribute" - (Just filename, contentType) -> insertMedia filename (T.pack <$> contentType) (decodeLenient (pack (strContent e))) + (Just filename, contentType) -> + insertMedia (T.unpack filename) contentType + (decodeLenient + (UTF8.fromTextLazy . TL.fromStrict . strContent $ e)) -- * Type parsers @@ -206,13 +213,13 @@ parseAuthor e = T.unwords . catMaybes <$> mapM parseAuthorChild (elChildren e) parseAuthorChild :: PandocMonad m => Element -> FB2 m (Maybe Text) parseAuthorChild e = - case T.pack $ qName $ elName e of - "first-name" -> pure $ Just $ T.pack $ strContent e - "middle-name" -> pure $ Just $ T.pack $ strContent e - "last-name" -> pure $ Just $ T.pack $ strContent e - "nickname" -> pure $ Just $ T.pack $ strContent e - "home-page" -> pure $ Just $ T.pack $ strContent e - "email" -> pure $ Just $ T.pack $ strContent e + case qName $ elName e of + "first-name" -> pure $ Just $ strContent e + "middle-name" -> pure $ Just $ strContent e + "last-name" -> pure $ Just $ strContent e + "nickname" -> pure $ Just $ strContent e + "home-page" -> pure $ Just $ strContent e + "email" -> pure $ Just $ strContent e name -> do report $ IgnoredElement $ name <> " in author" pure Nothing @@ -236,13 +243,13 @@ parseTitleContent _ = pure Nothing parseImageElement :: PandocMonad m => Element -> FB2 m Blocks parseImageElement e = case href of - Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash $ T.pack src) title alt + Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash src) title alt Nothing -> do report $ IgnoredElement " image without href" pure mempty - where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e - title = maybe "" T.pack $ findAttr (unqual "title") e - imgId = maybe "" T.pack $ findAttr (unqual "id") e + where alt = maybe mempty str $ findAttr (unqual "alt") e + title = fromMaybe "" $ findAttr (unqual "title") e + imgId = fromMaybe "" $ findAttr (unqual "id") e href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e -- | Parse @pType@ @@ -256,7 +263,7 @@ parseCite e = blockQuote . mconcat <$> mapM parseCiteChild (elChildren e) -- | Parse @citeType@ child parseCiteChild :: PandocMonad m => Element -> FB2 m Blocks parseCiteChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "empty-line" -> pure horizontalRule @@ -271,13 +278,13 @@ parsePoem e = mconcat <$> mapM parsePoemChild (elChildren e) parsePoemChild :: PandocMonad m => Element -> FB2 m Blocks parsePoemChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "epigraph" -> parseEpigraph e "stanza" -> parseStanza e "text-author" -> para <$> parsePType e - "date" -> pure $ para $ text $ T.pack $ strContent e + "date" -> pure $ para $ text $ strContent e name -> report (UnexpectedXmlElement name "poem") $> mempty parseStanza :: PandocMonad m => Element -> FB2 m Blocks @@ -290,7 +297,7 @@ joinLineBlocks [] = [] parseStanzaChild :: PandocMonad m => Element -> FB2 m Blocks parseStanzaChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "v" -> lineBlock . (:[]) <$> parsePType e @@ -300,11 +307,11 @@ parseStanzaChild e = parseEpigraph :: PandocMonad m => Element -> FB2 m Blocks parseEpigraph e = divWith (divId, ["epigraph"], []) . mconcat <$> mapM parseEpigraphChild (elChildren e) - where divId = maybe "" T.pack $ findAttr (unqual "id") e + where divId = fromMaybe "" $ findAttr (unqual "id") e parseEpigraphChild :: PandocMonad m => Element -> FB2 m Blocks parseEpigraphChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -318,7 +325,7 @@ parseAnnotation e = mconcat <$> mapM parseAnnotationChild (elChildren e) parseAnnotationChild :: PandocMonad m => Element -> FB2 m Blocks parseAnnotationChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -332,14 +339,14 @@ parseSection :: PandocMonad m => Element -> FB2 m Blocks parseSection e = do n <- gets fb2SectionLevel modify $ \st -> st{ fb2SectionLevel = n + 1 } - let sectionId = maybe "" T.pack $ findAttr (unqual "id") e + let sectionId = fromMaybe "" $ findAttr (unqual "id") e bs <- divWith (sectionId, ["section"], []) . mconcat <$> mapM parseSectionChild (elChildren e) modify $ \st -> st{ fb2SectionLevel = n } pure bs parseSectionChild :: PandocMonad m => Element -> FB2 m Blocks parseSectionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseBodyChild e "epigraph" -> parseEpigraph e "image" -> parseImageElement e @@ -361,16 +368,16 @@ parseStyleType e = mconcat <$> mapM parseInline (elContent e) parseNamedStyle :: PandocMonad m => Element -> FB2 m Inlines parseNamedStyle e = do content <- mconcat <$> mapM parseNamedStyleChild (elContent e) - let lang = maybeToList $ ("lang",) . T.pack <$> findAttr (QName "lang" Nothing (Just "xml")) e + let lang = maybeToList $ ("lang",) <$> findAttr (QName "lang" Nothing (Just "xml")) e case findAttr (unqual "name") e of - Just name -> pure $ spanWith ("", [T.pack name], lang) content + Just name -> pure $ spanWith ("", [name], lang) content Nothing -> do report $ IgnoredElement "link without required name" pure mempty parseNamedStyleChild :: PandocMonad m => Content -> FB2 m Inlines parseNamedStyleChild (Elem e) = - case T.pack $ qName (elName e) of + case qName (elName e) of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -378,7 +385,7 @@ parseNamedStyleChild (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement $ name <> " in style" @@ -390,7 +397,7 @@ parseLinkType :: PandocMonad m => Element -> FB2 m Inlines parseLinkType e = do content <- mconcat <$> mapM parseStyleLinkType (elContent e) notes <- gets fb2Notes - case T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of Just href -> case findAttr (QName "type" Nothing Nothing) e of Just "note" -> case M.lookup href notes of Nothing -> pure $ link href "" content @@ -417,15 +424,14 @@ parseTable _ = pure mempty -- TODO: tables are not supported yet -- | Parse @title-infoType@ parseTitleInfoChild :: PandocMonad m => Element -> FB2 m () parseTitleInfoChild e = - case T.pack $ qName (elName e) of + case qName (elName e) of "genre" -> pure () "author" -> parseAuthor e >>= \author -> modify (\st -> st {fb2Authors = author:fb2Authors st}) - "book-title" -> modify (setMeta "title" (text $ T.pack $ strContent e)) + "book-title" -> modify (setMeta "title" (text $ strContent e)) "annotation" -> parseAnnotation e >>= modify . setMeta "abstract" "keywords" -> modify (setMeta "keywords" (map (MetaString . trim) $ T.splitOn "," - $ T.pack $ strContent e)) - "date" -> modify (setMeta "date" (text $ T.pack $ strContent e)) + "date" -> modify (setMeta "date" (text $ strContent e)) "coverpage" -> parseCoverPage e "lang" -> pure () "src-lang" -> pure () @@ -439,7 +445,7 @@ parseCoverPage e = Just img -> case href of Just src -> modify (setMeta "cover-image" (MetaString $ removeHash src)) Nothing -> pure () - where href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img + where href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img Nothing -> pure () -- | Parse @inlineImageType@ element @@ -452,5 +458,5 @@ parseInlineImageElement e = Nothing -> do report $ IgnoredElement "inline image without href" pure mempty - where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e - href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e + where alt = maybe mempty str $ findAttr (unqual "alt") e + href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index eb78979a3..fdf4f28e0 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -5,7 +5,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -19,21 +19,20 @@ module Text.Pandoc.Readers.HTML ( readHtml , htmlInBalanced , isInlineTag , isBlockTag - , NamedTag(..) , isTextTag , isCommentTag ) where import Control.Applicative ((<|>)) -import Control.Arrow (first) import Control.Monad (guard, msum, mzero, unless, void) -import Control.Monad.Except (throwError) +import Control.Monad.Except (throwError, catchError) import Control.Monad.Reader (ask, asks, lift, local, runReaderT) import Data.ByteString.Base64 (encode) import Data.Char (isAlphaNum, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) import Data.List.Split (splitWhen) +import Data.List (foldl') import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Monoid (First (..)) @@ -62,21 +61,22 @@ import Text.Pandoc.Options ( ReaderOptions (readerExtensions, readerStripComments), extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) -import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, - extractSpaces, htmlSpanLikeElements, safeRead, tshow) +import Text.Pandoc.Shared ( + addMetaField, blocksToInlines', escapeURI, extractSpaces, + htmlSpanLikeElements, renderTags', safeRead, tshow) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) -- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: PandocMonad m +readHtml :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> a -- ^ Input to parse -> m Pandoc readHtml opts inp = do - let tags = stripPrefixes . canonicalizeTags $ + let tags = stripPrefixes $ canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } - (crFilter inp) + (sourcesToText $ toSources inp) parseDoc = do blocks <- fixPlains False . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState @@ -95,6 +95,15 @@ readHtml opts inp = do Right doc -> return doc Left err -> throwError $ PandocParseError $ T.pack $ getError err +-- Strip namespace prefixes on tags (not attributes) +stripPrefixes :: [Tag Text] -> [Tag Text] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag Text -> Tag Text +stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as +stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s) +stripPrefix x = x + replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState @@ -112,14 +121,18 @@ setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) pHtml :: PandocMonad m => TagParser m Blocks -pHtml = try $ do +pHtml = do (TagOpen "html" attr) <- lookAhead pAny - for_ (lookup "lang" attr) $ + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . B.setMeta "lang" . B.text pInTags "html" block pBody :: PandocMonad m => TagParser m Blocks -pBody = pInTags "body" block +pBody = do + (TagOpen "body" attr) <- lookAhead pAny + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ + updateState . B.setMeta "lang" . B.text + pInTags "body" block pHead :: PandocMonad m => TagParser m Blocks pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) @@ -145,32 +158,65 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) return mempty block :: PandocMonad m => TagParser m Blocks -block = do - res <- choice - [ eSection - , eSwitch B.para block - , mempty <$ eFootnote - , mempty <$ eTOC - , mempty <$ eTitlePage - , pPara - , pHeader - , pBlockQuote - , pCodeBlock - , pList - , pHrule - , pTable block - , pHtml - , pHead - , pBody - , pLineBlock - , pDiv - , pPlain - , pFigure - , pIframe - , pRawHtmlBlock - ] - trace (T.take 60 $ tshow $ B.toList res) - return res +block = ((do + tag <- lookAhead (pSatisfy isBlockTag) + exts <- getOption readerExtensions + case tag of + TagOpen name attr -> + let type' = fromMaybe "" $ + lookup "type" attr <|> lookup "epub:type" attr + epubExts = extensionEnabled Ext_epub_html_exts exts + in + case name of + _ | name `elem` sectioningContent + , epubExts + , "chapter" `T.isInfixOf` type' + -> eSection + _ | epubExts + , type' `elem` ["footnote", "rearnote"] + -> mempty <$ eFootnote + _ | epubExts + , type' == "toc" + -> mempty <$ eTOC + _ | "titlepage" `T.isInfixOf` type' + , name `elem` ("section" : groupingContent) + -> mempty <$ eTitlePage + "p" -> pPara + "h1" -> pHeader + "h2" -> pHeader + "h3" -> pHeader + "h4" -> pHeader + "h5" -> pHeader + "h6" -> pHeader + "blockquote" -> pBlockQuote + "pre" -> pCodeBlock + "ul" -> pBulletList + "ol" -> pOrderedList + "dl" -> pDefinitionList + "table" -> pTable block + "hr" -> pHrule + "html" -> pHtml + "head" -> pHead + "body" -> pBody + "div" + | extensionEnabled Ext_line_blocks exts + , Just "line-block" <- lookup "class" attr + -> pLineBlock + | otherwise + -> pDiv + "section" -> pDiv + "header" -> pDiv + "main" -> pDiv + "figure" -> pFigure + "iframe" -> pIframe + "style" -> pRawHtmlBlock + "textarea" -> pRawHtmlBlock + "switch" + | epubExts + -> eSwitch B.para block + _ -> mzero + _ -> mzero) <|> pPlain <|> pRawHtmlBlock) >>= \res -> + res <$ trace (T.take 60 $ tshow $ B.toList res) namespaces :: PandocMonad m => [(Text, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] @@ -243,9 +289,6 @@ eTOC = try $ do guard $ (lookup "type" attr <|> lookup "epub:type" attr) == Just "toc" void (pInTags tag block) -pList :: PandocMonad m => TagParser m Blocks -pList = pBulletList <|> pOrderedList <|> pDefinitionList - pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do pSatisfy (matchTagOpen "ul" []) @@ -319,7 +362,10 @@ pDefListItem = try $ do terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem - let term = foldl1 (\x y -> x <> B.linebreak <> y) $ map trimInlines terms + let term = foldl' (\x y -> if null x + then trimInlines y + else x <> B.linebreak <> trimInlines y) + mempty terms return (term, map (fixPlains True) defs) fixPlains :: Bool -> Blocks -> Blocks @@ -356,13 +402,16 @@ pLineBlock = try $ do B.toList ils return $ B.lineBlock lns +isDivLike :: Text -> Bool +isDivLike "div" = True +isDivLike "section" = True +isDivLike "header" = True +isDivLike "main" = True +isDivLike _ = False + pDiv :: PandocMonad m => TagParser m Blocks pDiv = try $ do guardEnabled Ext_native_divs - let isDivLike "div" = True - isDivLike "section" = True - isDivLike "main" = True - isDivLike _ = False TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) let (ident, classes, kvs) = toAttr attr' contents <- pInTags tag block @@ -380,11 +429,17 @@ pIframe = try $ do tag <- pSatisfy (tagOpen (=="iframe") (isJust . lookup "src")) pCloses "iframe" <|> eof url <- canonicalizeUrl $ fromAttrib "src" tag - (bs, _) <- openURL url - let inp = UTF8.toText bs - opts <- readerOpts <$> getState - Pandoc _ contents <- readHtml opts inp - return $ B.divWith ("",["iframe"],[]) $ B.fromList contents + if T.null url + then ignore $ renderTags' [tag, TagClose "iframe"] + else catchError + (do (bs, _) <- openURL url + let inp = UTF8.toText bs + opts <- readerOpts <$> getState + Pandoc _ contents <- readHtml opts inp + return $ B.divWith ("",["iframe"],[]) $ B.fromList contents) + (\e -> do + logMessage $ CouldNotFetchResource url (renderError e) + ignore $ renderTags' [tag, TagClose "iframe"]) pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do @@ -446,17 +501,13 @@ pHeader = try $ do tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let attr = toStringAttr attr' - let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) - [("class","title")] level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] T.words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] attr'' <- registerHeader (ident, classes, keyvals) contents - return $ if bodyTitle - then mempty -- skip a representation of the title in the body - else B.headerWith attr'' level contents + return $ B.headerWith attr'' level contents pHrule :: PandocMonad m => TagParser m Blocks pHrule = do @@ -506,7 +557,18 @@ pFigure = try $ do pCodeBlock :: PandocMonad m => TagParser m Blocks pCodeBlock = try $ do TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) - let attr = toAttr attr' + -- if the `pre` has no attributes, try if it is followed by a `code` + -- element and use those attributes if possible. + attr <- case attr' of + _:_ -> pure (toAttr attr') + [] -> option nullAttr $ do + TagOpen _ codeAttr <- pSatisfy (matchTagOpen "code" []) + pure $ toAttr + [ (k, v') | (k, v) <- codeAttr + -- strip language from class + , let v' = if k == "class" + then fromMaybe v (T.stripPrefix "language-" v) + else v ] contents <- manyTill pAny (pCloses "pre" <|> eof) let rawText = T.concat $ map tagToText contents -- drop leading newline if any @@ -525,31 +587,47 @@ tagToText (TagOpen "br" _) = "\n" tagToText _ = "" inline :: PandocMonad m => TagParser m Inlines -inline = choice - [ eNoteref - , eSwitch id inline - , pTagText - , pQ - , pEmph - , pStrong - , pSuperscript - , pSubscript - , pSpanLike - , pSmall - , pStrikeout - , pUnderline - , pLineBreak - , pLink - , pImage - , pSvg - , pBdo - , pCode - , pCodeWithClass [("samp","sample"),("var","variable")] - , pSpan - , pMath False - , pScriptMath - , pRawHtmlInline - ] +inline = pTagText <|> do + tag <- lookAhead (pSatisfy isInlineTag) + exts <- getOption readerExtensions + case tag of + TagOpen name attr -> + case name of + "a" | extensionEnabled Ext_epub_html_exts exts + , Just "noteref" <- lookup "type" attr <|> lookup "epub:type" attr + , Just ('#',_) <- lookup "href" attr >>= T.uncons + -> eNoteref + | otherwise -> pLink + "switch" -> eSwitch id inline + "q" -> pQ + "em" -> pEmph + "i" -> pEmph + "strong" -> pStrong + "b" -> pStrong + "sup" -> pSuperscript + "sub" -> pSubscript + "small" -> pSmall + "s" -> pStrikeout + "strike" -> pStrikeout + "del" -> pStrikeout + "u" -> pUnderline + "ins" -> pUnderline + "br" -> pLineBreak + "img" -> pImage + "svg" -> pSvg + "bdo" -> pBdo + "code" -> pCode + "samp" -> pCodeWithClass "samp" "sample" + "var" -> pCodeWithClass "var" "variable" + "span" -> pSpan + "math" -> pMath False + "script" + | Just x <- lookup "type" attr + , "math/tex" `T.isPrefixOf` x -> pScriptMath + _ | name `elem` htmlSpanLikeElements -> pSpanLike + _ -> pRawHtmlInline + TagText _ -> pTagText + _ -> pRawHtmlInline pSelfClosing :: PandocMonad m => (Text -> Bool) -> ([Attribute Text] -> Bool) @@ -560,27 +638,25 @@ pSelfClosing f g = do return open pQ :: PandocMonad m => TagParser m Inlines -pQ = choice $ map try [citedQuote, normalQuote] - where citedQuote = do - tag <- pSatisfy $ tagOpenLit "q" (any ((=="cite") . fst)) - - url <- canonicalizeUrl $ fromAttrib "cite" tag - let uid = fromMaybe (fromAttrib "name" tag) $ - maybeFromAttrib "id" tag - let cls = T.words $ fromAttrib "class" tag - - makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url)]) - normalQuote = do - pSatisfy $ tagOpenLit "q" (const True) - makeQuote id - makeQuote wrapper = do - ctx <- asks quoteContext - let (constructor, innerContext) = case ctx of - InDoubleQuote -> (B.singleQuoted, InSingleQuote) - _ -> (B.doubleQuoted, InDoubleQuote) - - content <- withQuoteContext innerContext (mconcat <$> manyTill inline (pCloses "q")) - return $ extractSpaces (constructor . wrapper) content +pQ = do + TagOpen _ attrs <- pSatisfy $ tagOpenLit "q" (const True) + case lookup "cite" attrs of + Just url -> do + let uid = fromMaybe mempty $ + lookup "name" attrs <> lookup "id" attrs + let cls = maybe [] T.words $ lookup "class" attrs + url' <- canonicalizeUrl url + makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url')]) + Nothing -> makeQuote id + where + makeQuote wrapper = do + ctx <- asks quoteContext + let (constructor, innerContext) = case ctx of + InDoubleQuote -> (B.singleQuoted, InSingleQuote) + _ -> (B.doubleQuoted, InDoubleQuote) + content <- withQuoteContext innerContext + (mconcat <$> manyTill inline (pCloses "q")) + return $ extractSpaces (constructor . wrapper) content pEmph :: PandocMonad m => TagParser m Inlines pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph @@ -646,17 +722,12 @@ pLink = try $ do pImage :: PandocMonad m => TagParser m Inlines pImage = do - tag <- pSelfClosing (=="img") (isJust . lookup "src") + tag@(TagOpen _ attr') <- pSelfClosing (=="img") (isJust . lookup "src") url <- canonicalizeUrl $ fromAttrib "src" tag let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag - let uid = fromAttrib "id" tag - let cls = T.words $ fromAttrib "class" tag - let getAtt k = case fromAttrib k tag of - "" -> [] - v -> [(k, v)] - let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"] - return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) + let attr = toAttr $ filter (\(k,_) -> k /= "alt" && k /= "title" && k /= "src") attr' + return $ B.imageWith attr (escapeURI url) title (B.text alt) pSvg :: PandocMonad m => TagParser m Inlines pSvg = do @@ -671,13 +742,12 @@ pSvg = do UTF8.toText (encode $ UTF8.fromText rawText) return $ B.imageWith (ident,cls,[]) svgData mempty mempty -pCodeWithClass :: PandocMonad m => [(T.Text,Text)] -> TagParser m Inlines -pCodeWithClass elemToClass = try $ do - let tagTest = flip elem . fmap fst $ elemToClass - TagOpen open attr' <- pSatisfy $ tagOpen tagTest (const True) +pCodeWithClass :: PandocMonad m => Text -> Text -> TagParser m Inlines +pCodeWithClass name class' = try $ do + TagOpen open attr' <- pSatisfy $ tagOpen (== name) (const True) result <- manyTill pAny (pCloses open) let (ids,cs,kvs) = toAttr attr' - cs' = maybe cs (:cs) . lookup open $ elemToClass + cs' = class' : cs return . B.codeWith (ids,cs',kvs) . T.unwords . T.lines . innerText $ result @@ -764,17 +834,19 @@ pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline pTagText :: PandocMonad m => TagParser m Inlines pTagText = try $ do + pos <- getPosition (TagText str) <- pSatisfy isTagText st <- getState qu <- ask parsed <- lift $ lift $ - flip runReaderT qu $ runParserT (many pTagContents) st "text" str + flip runReaderT qu $ runParserT (many pTagContents) st "text" + (Sources [(pos, str)]) case parsed of Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'" Right result -> return $ mconcat result -type InlinesParser m = HTMLParser m Text +type InlinesParser m = HTMLParser m Sources pTagContents :: PandocMonad m => InlinesParser m Inlines pTagContents = @@ -868,27 +940,23 @@ pSpace = many1 (satisfy isSpace) >>= \xs -> then return B.softbreak else return B.space -class NamedTag a where - getTagName :: a -> Maybe Text - -instance NamedTag (Tag Text) where - getTagName (TagOpen t _) = Just t - getTagName (TagClose t) = Just t - getTagName _ = Nothing - -instance NamedTag (Tag String) where - getTagName (TagOpen t _) = Just (T.pack t) - getTagName (TagClose t) = Just (T.pack t) - getTagName _ = Nothing - -isInlineTag :: NamedTag (Tag a) => Tag a -> Bool -isInlineTag t = - isCommentTag t || case getTagName t of - Nothing -> False - Just x -> x `Set.notMember` blockTags || - T.take 1 x == "?" -- processing instr. - -isBlockTag :: NamedTag (Tag a) => Tag a -> Bool +getTagName :: Tag Text -> Maybe Text +getTagName (TagOpen t _) = Just t +getTagName (TagClose t) = Just t +getTagName _ = Nothing + +isInlineTag :: Tag Text -> Bool +isInlineTag t = isCommentTag t || case t of + TagOpen "script" _ -> "math/tex" `T.isPrefixOf` fromAttrib "type" t + TagClose "script" -> True + TagOpen name _ -> isInlineTagName name + TagClose name -> isInlineTagName name + _ -> False + where isInlineTagName x = + x `Set.notMember` blockTags || + T.take 1 x == "?" -- processing instr. + +isBlockTag :: Tag Text -> Bool isBlockTag t = isBlockTagName || isTagComment t where isBlockTagName = case getTagName t of @@ -899,10 +967,10 @@ isBlockTag t = isBlockTagName || isTagComment t || x `Set.member` eitherBlockOrInline Nothing -> False -isTextTag :: Tag a -> Bool +isTextTag :: Tag Text -> Bool isTextTag = tagText (const True) -isCommentTag :: Tag a -> Bool +isCommentTag :: Tag Text -> Bool isCommentTag = tagComment (const True) --- parsers for use in markdown, textile readers @@ -910,13 +978,14 @@ isCommentTag = tagComment (const True) -- | Matches a stretch of HTML in balanced tags. htmlInBalanced :: Monad m => (Tag Text -> Bool) - -> ParserT Text st m Text + -> ParserT Sources st m Text htmlInBalanced f = try $ do lookAhead (char '<') - inp <- getInput - let ts = canonicalizeTags $ - parseTagsOptions parseOptions{ optTagWarning = True, - optTagPosition = True } inp + sources <- getInput + let ts = canonicalizeTags + $ parseTagsOptions parseOptions{ optTagWarning = True, + optTagPosition = True } + $ sourcesToText sources case ts of (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do guard $ f t @@ -951,22 +1020,24 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts go n (t:ts') = (t :) <$> go n ts' go _ [] = mzero -hasTagWarning :: [Tag a] -> Bool +hasTagWarning :: [Tag Text] -> Bool hasTagWarning (TagWarning _:_) = True hasTagWarning _ = False -- | Matches a tag meeting a certain condition. htmlTag :: (HasReaderOptions st, Monad m) => (Tag Text -> Bool) - -> ParserT Text st m (Tag Text, Text) + -> ParserT Sources st m (Tag Text, Text) htmlTag f = try $ do lookAhead (char '<') startpos <- getPosition - inp <- getInput + sources <- getInput + let inp = sourcesToText sources let ts = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = False , optTagPosition = True } - (inp <> " ") -- add space to ensure that + (inp <> " ") + -- add space to ensure that -- we get a TagPosition after the tag (next, ln, col) <- case ts of (TagPosition{} : next : TagPosition ln col : _) @@ -1024,21 +1095,6 @@ htmlTag f = try $ do handleTag tagname _ -> mzero --- Strip namespace prefixes -stripPrefixes :: [Tag Text] -> [Tag Text] -stripPrefixes = map stripPrefix - -stripPrefix :: Tag Text -> Tag Text -stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (first stripPrefix') as) -stripPrefix (TagClose s) = TagClose (stripPrefix' s) -stripPrefix x = x - -stripPrefix' :: Text -> Text -stripPrefix' s = - if T.null t then s else T.drop 1 t - where (_, t) = T.span (/= ':') s - -- Utilities -- | Adjusts a url according to the document's base URL. @@ -1048,26 +1104,3 @@ canonicalizeUrl url = do return $ case (parseURIReference (T.unpack url), mbBaseHref) of (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) _ -> url - --- For now we need a special version here; the one in Shared has String type -renderTags' :: [Tag Text] -> Text -renderTags' = renderTagsOptions - renderOptions{ optMinimize = matchTags ["hr", "br", "img", - "meta", "link"] - , optRawTag = matchTags ["script", "style"] } - where matchTags tags = flip elem tags . T.toLower - - --- EPUB Specific --- --- -{- - -types :: [(String, ([String], Int))] -types = -- Document divisions - map (\s -> (s, (["section", "body"], 0))) - ["volume", "part", "chapter", "division"] - <> -- Document section and components - [ - ("abstract", ([], 0))] --} diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 2d58319da..bd8d7c96c 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.HTML.Parsing - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -161,10 +161,12 @@ _ `closes` "html" = False "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "td" `closes` t | t `elem` ["th","td"] = True -"tr" `closes` t | t `elem` ["th","td","tr"] = True +"tr" `closes` t | t `elem` ["th","td","tr","colgroup"] = True "dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True "rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"col" `closes` "col" = True +"colgroup" `closes` "col" = True "optgroup" `closes` "optgroup" = True "optgroup" `closes` "option" = True "option" `closes` "option" = True @@ -193,14 +195,20 @@ t1 `closes` t2 | _ `closes` _ = False toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go +toStringAttr = foldr go [] where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) + go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)] + -- treat xml:lang as lang + go ("xml:lang",y) ats = go ("lang",y) ats + -- prevent duplicate attributes + go (x,y) ats + | any (\(x',_) -> x == x') ats = ats + | otherwise = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> go (x',y) ats + _ -> (x,y):ats -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index 5a783988f..6e62e12f5 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -3,8 +3,8 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.HTML.Table - Copyright : © 2006-2020 John MacFarlane, - 2020 Albert Krewinkel + Copyright : © 2006-2021 John MacFarlane, + 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <albert@zeitkraut.de> @@ -17,6 +17,8 @@ module Text.Pandoc.Readers.HTML.Table (pTable) where import Control.Applicative ((<|>)) import Data.Maybe (fromMaybe) +import Data.Either (lefts, rights) +import Data.List.NonEmpty (nonEmpty) import Data.Text (Text) import Text.HTML.TagSoup import Text.Pandoc.Builder (Blocks) @@ -32,34 +34,51 @@ import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) import qualified Data.Text as T import qualified Text.Pandoc.Builder as B --- | Parses a @<col>@ element, returning the column's width. Defaults to --- @'ColWidthDefault'@ if the width is not set or cannot be determined. -pCol :: PandocMonad m => TagParser m ColWidth +-- | Parses a @<col>@ element, returning the column's width. +-- An Either value is used: Left i means a "relative length" with +-- integral value i (see https://www.w3.org/TR/html4/types.html#h-6.6); +-- Right w means a regular width. Defaults to @'Right ColWidthDefault'@ +-- if the width is not set or cannot be determined. +pCol :: PandocMonad m => TagParser m (Either Int ColWidth) pCol = try $ do TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) let attribs = toStringAttr attribs' skipMany pBlank optional $ pSatisfy (matchTagClose "col") skipMany pBlank - let width = case lookup "width" attribs of + return $ case lookup "width" attribs of Nothing -> case lookup "style" attribs of Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> - fromMaybe 0.0 $ safeRead (T.filter - (`notElem` (" \t\r\n%'\";" :: [Char])) xs) - _ -> 0.0 + maybe (Right ColWidthDefault) (Right . ColWidth . (/ 100.0)) + $ safeRead (T.filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) + _ -> Right ColWidthDefault + Just (T.unsnoc -> Just (xs, '*')) -> + maybe (Left 1) Left $ safeRead xs Just (T.unsnoc -> Just (xs, '%')) -> - fromMaybe 0.0 $ safeRead xs - _ -> 0.0 - if width > 0.0 - then return $ ColWidth $ width / 100.0 - else return ColWidthDefault + maybe (Right ColWidthDefault) + (Right . ColWidth . (/ 100.0)) $ safeRead xs + _ -> Right ColWidthDefault -pColgroup :: PandocMonad m => TagParser m [ColWidth] +pColgroup :: PandocMonad m => TagParser m [Either Int ColWidth] pColgroup = try $ do pSatisfy (matchTagOpen "colgroup" []) skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank +resolveRelativeLengths :: [Either Int ColWidth] -> [ColWidth] +resolveRelativeLengths ws = + let remaining = 1 - sum (map getColWidth $ rights ws) + relatives = sum $ lefts ws + relUnit = remaining / fromIntegral relatives + toColWidth (Right x) = x + toColWidth (Left i) = ColWidth (fromIntegral i * relUnit) + in map toColWidth ws + +getColWidth :: ColWidth -> Double +getColWidth ColWidthDefault = 0 +getColWidth (ColWidth w) = w + data CellType = HeaderCell | BodyCell @@ -181,7 +200,8 @@ pTable :: PandocMonad m pTable block = try $ do TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank caption <- option mempty $ pInTags "caption" block <* skipMany pBlank - widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank + widths <- resolveRelativeLengths <$> + ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank thead <- pTableHead block <* skipMany pBlank topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank tbodies <- many (pTableBody block) <* skipMany pBlank @@ -214,8 +234,9 @@ normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) normalize widths head' bodies foot = do let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot - let rowLength = length . rowCells - let ncols = maximum (map rowLength rows) + let cellWidth (Cell _ _ _ (ColSpan cs) _) = cs + let rowLength = foldr (\cell acc -> cellWidth cell + acc) 0 . rowCells + let ncols = maybe 0 maximum $ nonEmpty $ map rowLength rows let tblType = tableType (map rowCells rows) -- fail on empty table if null rows diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs index 4f82a1831..b7bd40fee 100644 --- a/src/Text/Pandoc/Readers/HTML/TagCategories.hs +++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.HTML.TagCategories - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs index a94eeb828..12c519ad6 100644 --- a/src/Text/Pandoc/Readers/HTML/Types.hs +++ b/src/Text/Pandoc/Readers/HTML/Types.hs @@ -2,7 +2,7 @@ {-# LANGUAGE MultiParamTypeClasses #-} {- | Module : Text.Pandoc.Readers.HTML.Types - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index 25d69f040..67b3af2d3 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -17,8 +17,9 @@ module Text.Pandoc.Readers.Haddock import Control.Monad.Except (throwError) import Data.List (intersperse) +import Data.List.NonEmpty (nonEmpty) import Data.Maybe (fromMaybe) -import Data.Text (Text, unpack) +import Data.Text (unpack) import qualified Data.Text as T import Documentation.Haddock.Parser import Documentation.Haddock.Types as H @@ -28,15 +29,17 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Definition import Text.Pandoc.Error import Text.Pandoc.Options -import Text.Pandoc.Shared (crFilter, splitTextBy, trim) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import Text.Pandoc.Shared (splitTextBy, trim) -- | Parse Haddock markup and return a 'Pandoc' document. -readHaddock :: PandocMonad m +readHaddock :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc -readHaddock opts s = case readHaddockEither opts (unpack (crFilter s)) of +readHaddock opts s = case readHaddockEither opts + (unpack . sourcesToText . toSources $ s) of Right result -> return result Left e -> throwError e @@ -92,7 +95,7 @@ docHToBlocks d' = then ([], map toCells bodyRows) else (toCells (head headerRows), map toCells (tail headerRows ++ bodyRows)) - colspecs = replicate (maximum (map length body)) + colspecs = replicate (maybe 0 maximum (nonEmpty (map length body))) (AlignDefault, ColWidthDefault) in B.table B.emptyCaption colspecs @@ -128,7 +131,8 @@ docHToInlines isCode d' = DocIdentifier s -> B.codeWith ("",["haskell","identifier"],[]) $ T.pack s _ -> mempty DocIdentifierUnchecked s -> B.codeWith ("",["haskell","identifier"],[]) $ T.pack s - DocModule s -> B.codeWith ("",["haskell","module"],[]) $ T.pack s + DocModule s -> B.codeWith ("",["haskell","module"],[]) $ + T.pack (modLinkName s) DocWarning _ -> mempty -- TODO DocEmphasis d -> B.emph (docHToInlines isCode d) DocMonospaced (DocString s) -> B.code $ T.pack s diff --git a/src/Text/Pandoc/Readers/Ipynb.hs b/src/Text/Pandoc/Readers/Ipynb.hs index a866e6ec3..cd1093109 100644 --- a/src/Text/Pandoc/Readers/Ipynb.hs +++ b/src/Text/Pandoc/Readers/Ipynb.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.Ipynb - Copyright : Copyright (C) 2019-2020 John MacFarlane + Copyright : Copyright (C) 2019-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -39,10 +39,12 @@ import Data.Aeson as Aeson import Control.Monad.Except (throwError) import Text.Pandoc.Readers.Markdown (readMarkdown) import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Sources (ToSources(..), sourcesToText) -readIpynb :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readIpynb opts t = do - let src = BL.fromStrict (TE.encodeUtf8 t) +readIpynb :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc +readIpynb opts x = do + let src = BL.fromStrict . TE.encodeUtf8 . sourcesToText $ toSources x case eitherDecode src of Right (notebook4 :: Notebook NbV4) -> notebookToPandoc opts notebook4 Left _ -> diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index c638da519..9cdbf1611 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -14,7 +14,9 @@ Conversion of JATS XML to 'Pandoc' document. module Text.Pandoc.Readers.JATS ( readJATS ) where import Control.Monad.State.Strict -import Data.Char (isDigit, isSpace, toUpper) +import Control.Monad.Except (throwError) +import Text.Pandoc.Error (PandocError(..)) +import Data.Char (isDigit, isSpace) import Data.Default import Data.Generics import Data.List (foldl', intersperse) @@ -22,15 +24,17 @@ import qualified Data.Map as Map import Data.Maybe (maybeToList, fromMaybe) import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options -import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) +import Text.Pandoc.Shared (safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light +import Text.Pandoc.XML.Light import qualified Data.Set as S (fromList, member) import Data.Set ((\\)) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) type JATS m = StateT JATSState m @@ -49,42 +53,28 @@ instance Default JATSState where , jatsContent = [] } -readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readJATS :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readJATS _ inp = do - let tree = normalizeTree . parseXML - $ T.unpack $ crFilter inp + let sources = toSources inp + tree <- either (throwError . PandocXMLError "") return $ + parseXMLContents (TL.fromStrict . sourcesToText $ sources) (bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree return $ Pandoc (jatsMeta st') (toList . mconcat $ bs) --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) - -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr = fromMaybe "" . maybeAttrValue attr -maybeAttrValue :: String -> Element -> Maybe Text +maybeAttrValue :: Text -> Element -> Maybe Text maybeAttrValue attr elt = - T.pack <$> lookupAttrBy (\x -> qName x == attr) (elAttribs elt) + lookupAttrBy (\x -> qName x == attr) (elAttribs elt) -- convenience function -named :: String -> Element -> Bool +named :: Text -> Element -> Bool named s e = qName (elName e) == s -- @@ -150,10 +140,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> JATS m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "p" -> parseMixed para (elContent e) @@ -202,7 +192,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ textContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -266,7 +256,7 @@ parseBlock (Elem e) = Just "center" -> AlignCenter _ -> AlignDefault let toWidth c = do - w <- findAttrText (unqual "colwidth") c + w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> isDigit x || x == '.') w if n > 0 then Just n else Nothing let numrows = foldl' max 0 $ map length bodyrows @@ -437,16 +427,10 @@ parseRef e = do Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty -- TODO handle mixed-citation -findAttrText :: QName -> Element -> Maybe Text -findAttrText x = fmap T.pack . findAttr x - textContent :: Element -> Text -textContent = T.pack . strContent - -textContentRecursive :: Element -> Text -textContentRecursive = T.pack . strContentRecursive +textContent = strContent -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -455,9 +439,9 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> JATS m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s -parseInline (CRef ref) = - return . text . maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = return $ maybe (text $ T.toUpper ref) (text . T.pack) + $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "italic" -> innerInlines emph @@ -502,9 +486,9 @@ parseInline (Elem e) = else linkWith attr ("#" <> rid) "" ils "ext-link" -> do ils <- innerInlines id - let title = fromMaybe "" $ findAttrText (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e + let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "rid" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, [], []) @@ -512,7 +496,8 @@ parseInline (Elem e) = "disp-formula" -> formula displayMath "inline-formula" -> formula math - "math" | qPrefix (elName e) == Just "mml" -> return . math $ mathML e + "math" | qURI (elName e) == Just "http://www.w3.org/1998/Math/MathML" + -> return . math $ mathML e "tex-math" -> return . math $ textContent e "email" -> return $ link ("mailto:" <> textContent e) "" @@ -524,7 +509,7 @@ parseInline (Elem e) = where innerInlines f = extractSpaces f . mconcat <$> mapM parseInline (elContent e) mathML x = - case readMathML . T.pack . showElement $ everywhere (mkT removePrefix) x of + case readMathML . showElement $ everywhere (mkT removePrefix) x of Left _ -> mempty Right m -> writeTeX m formula constructor = do @@ -535,11 +520,12 @@ parseInline (Elem e) = filterChildren isMathML whereToLook return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs - isMathML x = qName (elName x) == "math" && - qPrefix (elName x) == Just "mml" + isMathML x = qName (elName x) == "math" && + qURI (elName x) == + Just "http://www.w3.org/1998/Math/MathML" removePrefix elname = elname { qPrefix = Nothing } codeWithLang = do let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ textContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e diff --git a/src/Text/Pandoc/Readers/Jira.hs b/src/Text/Pandoc/Readers/Jira.hs index 9266ce10d..cf111f173 100644 --- a/src/Text/Pandoc/Readers/Jira.hs +++ b/src/Text/Pandoc/Readers/Jira.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org - Copyright : © 2019-2020 Albert Krewinkel + Copyright : © 2019-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -20,18 +20,20 @@ import Text.Pandoc.Builder hiding (cell) import Text.Pandoc.Error (PandocError (PandocParseError)) import Text.Pandoc.Options (ReaderOptions) import Text.Pandoc.Shared (stringify) - +import Text.Pandoc.Sources (ToSources(..), sourcesToText) import qualified Text.Jira.Markup as Jira -- | Read Jira wiki markup. -readJira :: PandocMonad m +readJira :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc -readJira _opts s = case parse s of - Right d -> return $ jiraToPandoc d - Left e -> throwError . PandocParseError $ - "Jira parse error" `append` pack (show e) +readJira _opts inp = do + let sources = toSources inp + case parse (sourcesToText sources) of + Right d -> return $ jiraToPandoc d + Left e -> throwError . PandocParseError $ + "Jira parse error" `append` pack (show e) jiraToPandoc :: Jira.Doc -> Pandoc jiraToPandoc (Jira.Doc blks) = doc $ foldMap jiraToPandocBlocks blks @@ -71,10 +73,10 @@ toPandocCodeBlocks langMay params txt = Nothing -> [] in codeBlockWith ("", classes, map paramToPair params) txt --- | Create a pandoc @'Div'@ +-- | Create a pandoc @'Div'@ from a panel. toPandocDiv :: [Jira.Parameter] -> [Jira.Block] -> Blocks toPandocDiv params = - divWith ("", [], map paramToPair params) . foldMap jiraToPandocBlocks + divWith ("", ["panel"], map paramToPair params) . foldMap jiraToPandocBlocks paramToPair :: Jira.Parameter -> (Text, Text) paramToPair (Jira.Parameter key value) = (key, value) @@ -170,6 +172,8 @@ jiraLinkToPandoc linkType alias url = Jira.Email -> link ("mailto:" <> url') "" alias' Jira.Attachment -> linkWith ("", ["attachment"], []) url' "" alias' Jira.User -> linkWith ("", ["user-account"], []) url' "" alias' + Jira.SmartCard -> linkWith ("", ["smart-card"], []) url' "" alias' + Jira.SmartLink -> linkWith ("", ["smart-link"], []) url' "" alias' -- | Get unicode representation of a Jira icon. iconUnicode :: Jira.Icon -> Text diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index cdccaa535..27c018e73 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,14 +1,10 @@ -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE CPP #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -22,50 +18,58 @@ module Text.Pandoc.Readers.LaTeX ( readLaTeX, applyMacros, rawLaTeXInline, rawLaTeXBlock, - inlineCommand, - tokenize, - untokenize + inlineCommand ) where import Control.Applicative (many, optional, (<|>)) import Control.Monad import Control.Monad.Except (throwError) -import Data.Char (isDigit, isLetter, toUpper, chr) +import Data.Char (isDigit, isLetter, isAlphaNum, toUpper, chr) import Data.Default -import Data.Functor (($>)) import Data.List (intercalate) import qualified Data.Map as M import Data.Maybe (fromMaybe, maybeToList) import qualified Data.Set as Set -import qualified Data.Sequence as Seq import Data.Text (Text) import qualified Data.Text as T +import Skylighting (defaultSyntaxMap) import System.FilePath (addExtension, replaceExtension, takeExtension) -import Text.Pandoc.BCP47 (Lang (..), renderLang) -import Text.Pandoc.Builder +import Text.Collate.Lang (renderLang) +import Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocPure (PandocPure) import Text.Pandoc.Class.PandocMonad (PandocMonad (..), getResourcePath, - readFileFromDirs, report, setResourcePath, - setTranslations, translateTerm) + readFileFromDirs, report, + setResourcePath) import Text.Pandoc.Error (PandocError (PandocParseError, PandocParsecError)) -import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension) +import Text.Pandoc.Highlighting (languagesByExtension) import Text.Pandoc.ImageSize (numUnit, showFl) import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, optional, space, spaces, withRaw, (<|>)) -import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..), - ArgSpec (..), Tok (..), TokType (..)) +import Text.Pandoc.Readers.LaTeX.Types (Tok (..), TokType (..)) import Text.Pandoc.Readers.LaTeX.Parsing -import Text.Pandoc.Readers.LaTeX.Lang (polyglossiaLangToBCP47, - babelLangToBCP47) -import Text.Pandoc.Readers.LaTeX.SIunitx +import Text.Pandoc.Readers.LaTeX.Citation (citationCommands, cites) +import Text.Pandoc.Readers.LaTeX.Math (dollarsMath, inlineEnvironments, + inlineEnvironment, + mathDisplay, mathInline, + newtheorem, theoremstyle, proof, + theoremEnvironment) +import Text.Pandoc.Readers.LaTeX.Table (tableEnvironments) +import Text.Pandoc.Readers.LaTeX.Macro (macroDef) +import Text.Pandoc.Readers.LaTeX.Lang (inlineLanguageCommands, + enquoteCommands, + babelLangToBCP47, setDefaultLanguage) +import Text.Pandoc.Readers.LaTeX.SIunitx (siunitxCommands) +import Text.Pandoc.Readers.LaTeX.Inline (acronymCommands, refCommands, + nameCommands, charCommands, + accentCommands, + biblatexInlineCommands, + verbCommands, rawInlineOr, + listingsLanguage) import Text.Pandoc.Shared -import qualified Text.Pandoc.Translations as Translations import Text.Pandoc.Walk -import qualified Text.Pandoc.Builder as B -import qualified Data.Text.Normalize as Normalize -import Safe +import Data.List.NonEmpty (nonEmpty) -- for debugging: -- import Text.Pandoc.Extensions (getDefaultExtensions) @@ -73,16 +77,17 @@ import Safe -- import Debug.Trace (traceShowId) -- | Parse LaTeX from string and return 'Pandoc' document. -readLaTeX :: PandocMonad m +readLaTeX :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> a -- ^ Input to parse -> m Pandoc readLaTeX opts ltx = do + let sources = toSources ltx parsed <- runParserT parseLaTeX def{ sOptions = opts } "source" - (tokenize "source" (crFilter ltx)) + (tokenizeSources sources) case parsed of Right result -> return result - Left e -> throwError $ PandocParsecError ltx e + Left e -> throwError $ PandocParsecError sources e parseLaTeX :: PandocMonad m => LP m Pandoc parseLaTeX = do @@ -93,11 +98,7 @@ parseLaTeX = do let doc' = doc bs let headerLevel (Header n _ _) = [n] headerLevel _ = [] -#if MIN_VERSION_safe(0,3,18) - let bottomLevel = minimumBound 1 $ query headerLevel doc' -#else - let bottomLevel = minimumDef 1 $ query headerLevel doc' -#endif + let bottomLevel = maybe 1 minimum $ nonEmpty $ query headerLevel doc' let adjustHeaders m (Header n attr ils) = Header (n+m) attr ils adjustHeaders _ x = x let (Pandoc _ bs') = @@ -132,11 +133,10 @@ resolveRefs _ x = x rawLaTeXBlock :: (PandocMonad m, HasMacros s, HasReaderOptions s) - => ParserT Text s m Text + => ParserT Sources s m Text rawLaTeXBlock = do lookAhead (try (char '\\' >> letter)) - inp <- getInput - let toks = tokenize "source" inp + toks <- getInputTokens snd <$> (rawLaTeXParser toks False (macroDef (const mempty)) blocks <|> rawLaTeXParser toks True (do choice (map controlSeq @@ -163,14 +163,13 @@ beginOrEndCommand = try $ do (txt <> untokenize rawargs) rawLaTeXInline :: (PandocMonad m, HasMacros s, HasReaderOptions s) - => ParserT Text s m Text + => ParserT Sources s m Text rawLaTeXInline = do lookAhead (try (char '\\' >> letter)) - inp <- getInput - let toks = tokenize "source" inp + toks <- getInputTokens raw <- snd <$> ( rawLaTeXParser toks True - (mempty <$ (controlSeq "input" >> skipMany opt >> braced)) + (mempty <$ (controlSeq "input" >> skipMany rawopt >> braced)) inlines <|> rawLaTeXParser toks True (inlineEnvironment <|> inlineCommand') inlines @@ -178,11 +177,10 @@ rawLaTeXInline = do finalbraces <- mconcat <$> many (try (string "{}")) -- see #5439 return $ raw <> T.pack finalbraces -inlineCommand :: PandocMonad m => ParserT Text ParserState m Inlines +inlineCommand :: PandocMonad m => ParserT Sources ParserState m Inlines inlineCommand = do lookAhead (try (char '\\' >> letter)) - inp <- getInput - let toks = tokenize "source" inp + toks <- getInputTokens fst <$> rawLaTeXParser toks True (inlineEnvironment <|> inlineCommand') inlines @@ -191,12 +189,6 @@ inlineCommand = do word :: PandocMonad m => LP m Inlines word = str . untoken <$> satisfyTok isWordTok -regularSymbol :: PandocMonad m => LP m Inlines -regularSymbol = str . untoken <$> satisfyTok isRegularSymbol - where isRegularSymbol (Tok _ Symbol t) = not $ T.any isSpecial t - isRegularSymbol _ = False - isSpecial c = c `Set.member` specialChars - inlineGroup :: PandocMonad m => LP m Inlines inlineGroup = do ils <- grouped inline @@ -237,19 +229,6 @@ mkImage options (T.unpack -> src) = do _ -> return src return $ imageWith attr (T.pack src') "" alt -doxspace :: PandocMonad m => LP m Inlines -doxspace = - (space <$ lookAhead (satisfyTok startsWithLetter)) <|> return mempty - where startsWithLetter (Tok _ Word t) = - case T.uncons t of - Just (c, _) | isLetter c -> True - _ -> False - startsWithLetter _ = False - - -lit :: Text -> LP m Inlines -lit = pure . str - removeDoubleQuotes :: Text -> Text removeDoubleQuotes t = Data.Maybe.fromMaybe t $ T.stripPrefix "\"" t >>= T.stripSuffix "\"" @@ -296,23 +275,14 @@ quoted' f starter ender = do cs -> cs) else lit startchs -enquote :: PandocMonad m => Bool -> Maybe Text -> LP m Inlines -enquote starred mblang = do - skipopts - let lang = mblang >>= babelLangToBCP47 - let langspan = case lang of - Nothing -> id - Just l -> spanWith ("",[],[("lang", renderLang l)]) - quoteContext <- sQuoteContext <$> getState - if starred || quoteContext == InDoubleQuote - then singleQuoted . langspan <$> withQuoteContext InSingleQuote tok - else doubleQuoted . langspan <$> withQuoteContext InDoubleQuote tok +lit :: Text -> LP m Inlines +lit = pure . str blockquote :: PandocMonad m => Bool -> Maybe Text -> LP m Blocks blockquote cvariant mblang = do citepar <- if cvariant then (\xs -> para (cite xs mempty)) - <$> cites NormalCitation False + <$> cites inline NormalCitation False else option mempty $ para <$> bracketed inline let lang = mblang >>= babelLangToBCP47 let langdiv = case lang of @@ -323,224 +293,13 @@ blockquote cvariant mblang = do optional $ symbolIn (".:;?!" :: [Char]) -- currently ignored return $ blockQuote . langdiv $ (bs <> citepar) -doAcronym :: PandocMonad m => Text -> LP m Inlines -doAcronym form = do - acro <- braced - return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), - ("acronym-form", "singular+" <> form)]) - $ str $ untokenize acro] - -doAcronymPlural :: PandocMonad m => Text -> LP m Inlines -doAcronymPlural form = do - acro <- braced - plural <- lit "s" - return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), - ("acronym-form", "plural+" <> form)]) $ - mconcat [str $ untokenize acro, plural]] - -doverb :: PandocMonad m => LP m Inlines -doverb = do - Tok _ Symbol t <- anySymbol - marker <- case T.uncons t of - Just (c, ts) | T.null ts -> return c - _ -> mzero - withVerbatimMode $ - code . untokenize <$> - manyTill (notFollowedBy newlineTok >> verbTok marker) (symbol marker) - -verbTok :: PandocMonad m => Char -> LP m Tok -verbTok stopchar = do - t@(Tok pos toktype txt) <- anyTok - case T.findIndex (== stopchar) txt of - Nothing -> return t - Just i -> do - let (t1, t2) = T.splitAt i txt - inp <- getInput - setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar) - : totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2) ++ inp - return $ Tok pos toktype t1 - -listingsLanguage :: [(Text, Text)] -> Maybe Text -listingsLanguage opts = - case lookup "language" opts of - Nothing -> Nothing - Just l -> fromListingsLanguage l `mplus` Just l - -dolstinline :: PandocMonad m => LP m Inlines -dolstinline = do - options <- option [] keyvals - let classes = maybeToList $ listingsLanguage options - doinlinecode classes - -domintinline :: PandocMonad m => LP m Inlines -domintinline = do - skipopts - cls <- untokenize <$> braced - doinlinecode [cls] - -doinlinecode :: PandocMonad m => [Text] -> LP m Inlines -doinlinecode classes = do - Tok _ Symbol t <- anySymbol - marker <- case T.uncons t of - Just (c, ts) | T.null ts -> return c - _ -> mzero - let stopchar = if marker == '{' then '}' else marker - withVerbatimMode $ - codeWith ("",classes,[]) . T.map nlToSpace . untokenize <$> - manyTill (verbTok stopchar) (symbol stopchar) - -nlToSpace :: Char -> Char -nlToSpace '\n' = ' ' -nlToSpace x = x - -mathDisplay :: Text -> Inlines -mathDisplay = displayMath . trimMath - -mathInline :: Text -> Inlines -mathInline = math . trimMath - -dollarsMath :: PandocMonad m => LP m Inlines -dollarsMath = do - symbol '$' - display <- option False (True <$ symbol '$') - (do contents <- try $ untokenize <$> pDollarsMath 0 - if display - then mathDisplay contents <$ symbol '$' - else return $ mathInline contents) - <|> (guard display >> return (mathInline "")) - --- Int is number of embedded groupings -pDollarsMath :: PandocMonad m => Int -> LP m [Tok] -pDollarsMath n = do - tk@(Tok _ toktype t) <- anyTok - case toktype of - Symbol | t == "$" - , n == 0 -> return [] - | t == "\\" -> do - tk' <- anyTok - (tk :) . (tk' :) <$> pDollarsMath n - | t == "{" -> (tk :) <$> pDollarsMath (n+1) - | t == "}" -> - if n > 0 - then (tk :) <$> pDollarsMath (n-1) - else mzero - _ -> (tk :) <$> pDollarsMath n - --- citations - -addPrefix :: [Inline] -> [Citation] -> [Citation] -addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks -addPrefix _ _ = [] - -addSuffix :: [Inline] -> [Citation] -> [Citation] -addSuffix s ks@(_:_) = - let k = last ks - in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] -addSuffix _ _ = [] - -simpleCiteArgs :: PandocMonad m => LP m [Citation] -simpleCiteArgs = try $ do - first <- optionMaybe $ toList <$> opt - second <- optionMaybe $ toList <$> opt - keys <- try $ bgroup *> manyTill citationLabel egroup - let (pre, suf) = case (first , second ) of - (Just s , Nothing) -> (mempty, s ) - (Just s , Just t ) -> (s , t ) - _ -> (mempty, mempty) - conv k = Citation { citationId = k - , citationPrefix = [] - , citationSuffix = [] - , citationMode = NormalCitation - , citationHash = 0 - , citationNoteNum = 0 - } - return $ addPrefix pre $ addSuffix suf $ map conv keys - -citationLabel :: PandocMonad m => LP m Text -citationLabel = do - sp - untokenize <$> - (many1 (satisfyTok isWordTok <|> symbolIn bibtexKeyChar) - <* sp - <* optional (symbol ',') - <* sp) - where bibtexKeyChar = ".:;?!`'()/*@_+=-&[]" :: [Char] - -cites :: PandocMonad m => CitationMode -> Bool -> LP m [Citation] -cites mode multi = try $ do - cits <- if multi - then do - multiprenote <- optionMaybe $ toList <$> paropt - multipostnote <- optionMaybe $ toList <$> paropt - let (pre, suf) = case (multiprenote, multipostnote) of - (Just s , Nothing) -> (mempty, s) - (Nothing , Just t) -> (mempty, t) - (Just s , Just t ) -> (s, t) - _ -> (mempty, mempty) - tempCits <- many1 simpleCiteArgs - case tempCits of - (k:ks) -> case ks of - (_:_) -> return $ (addMprenote pre k : init ks) ++ - [addMpostnote suf (last ks)] - _ -> return [addMprenote pre (addMpostnote suf k)] - _ -> return [[]] - else count 1 simpleCiteArgs - let cs = concat cits - return $ case mode of - AuthorInText -> case cs of - (c:rest) -> c {citationMode = mode} : rest - [] -> [] - _ -> map (\a -> a {citationMode = mode}) cs - where mprenote (k:ks) = (k:ks) ++ [Space] - mprenote _ = mempty - mpostnote (k:ks) = [Str ",", Space] ++ (k:ks) - mpostnote _ = mempty - addMprenote mpn (k:ks) = - let mpnfinal = case citationPrefix k of - (_:_) -> mprenote mpn - _ -> mpn - in addPrefix mpnfinal (k:ks) - addMprenote _ _ = [] - addMpostnote = addSuffix . mpostnote - -citation :: PandocMonad m => Text -> CitationMode -> Bool -> LP m Inlines -citation name mode multi = do - (c,raw) <- withRaw $ cites mode multi - return $ cite c (rawInline "latex" $ "\\" <> name <> untokenize raw) - -handleCitationPart :: Inlines -> [Citation] -handleCitationPart ils = - let isCite Cite{} = True - isCite _ = False - (pref, rest) = break isCite (toList ils) - in case rest of - (Cite cs _:suff) -> addPrefix pref $ addSuffix suff cs - _ -> [] - -complexNatbibCitation :: PandocMonad m => CitationMode -> LP m Inlines -complexNatbibCitation mode = try $ do - (cs, raw) <- - withRaw $ concat <$> do - bgroup - items <- mconcat <$> - many1 (notFollowedBy (symbol ';') >> inline) - `sepBy1` symbol ';' - egroup - return $ map handleCitationPart items - case cs of - [] -> mzero - (c:cits) -> return $ cite (c{ citationMode = mode }:cits) - (rawInline "latex" $ "\\citetext" <> untokenize raw) - -inNote :: Inlines -> Inlines -inNote ils = - note $ para $ ils <> str "." - inlineCommand' :: PandocMonad m => LP m Inlines inlineCommand' = try $ do Tok _ (CtrlSeq name) cmd <- anyControlSeq guard $ name /= "begin" && name /= "end" && name /= "and" - star <- option "" ("*" <$ symbol '*' <* sp) + star <- if T.all isAlphaNum name + then option "" ("*" <$ symbol '*' <* sp) + else pure "" overlay <- option "" overlaySpecification let name' = name <> star <> overlay let names = ordNub [name', name] -- check non-starred as fallback @@ -551,28 +310,8 @@ inlineCommand' = try $ do <|> ignore rawcommand lookupListDefault raw names inlineCommands - tok :: PandocMonad m => LP m Inlines -tok = try $ spaces >> grouped inline <|> inlineCommand' <|> singleChar' - where singleChar' = do - Tok _ _ t <- singleChar - return $ str t - -opt :: PandocMonad m => LP m Inlines -opt = do - toks <- try (sp *> bracketedToks <* sp) - -- now parse the toks as inlines - st <- getState - parsed <- runParserT (mconcat <$> many inline) st "bracketed option" toks - case parsed of - Right result -> return result - Left e -> throwError $ PandocParsecError (untokenize toks) e - -paropt :: PandocMonad m => LP m Inlines -paropt = parenWrapped inline - -inBrackets :: Inlines -> Inlines -inBrackets x = str "[" <> x <> str "]" +tok = tokWith inline unescapeURL :: Text -> Text unescapeURL = T.concat . go . T.splitOn "\\" @@ -585,381 +324,109 @@ unescapeURL = T.concat . go . T.splitOn "\\" , isEscapable c = t | otherwise = "\\" <> t -mathEnvWith :: PandocMonad m - => (Inlines -> a) -> Maybe Text -> Text -> LP m a -mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name - where inner x = case innerEnv of - Nothing -> x - Just y -> "\\begin{" <> y <> "}\n" <> x <> - "\\end{" <> y <> "}" - -mathEnv :: PandocMonad m => Text -> LP m Text -mathEnv name = do - skipopts - optional blankline - res <- manyTill anyTok (end_ name) - return $ stripTrailingNewlines $ untokenize res - -inlineEnvironment :: PandocMonad m => LP m Inlines -inlineEnvironment = try $ do - controlSeq "begin" - name <- untokenize <$> braced - M.findWithDefault mzero name inlineEnvironments - -inlineEnvironments :: PandocMonad m => M.Map Text (LP m Inlines) -inlineEnvironments = M.fromList [ - ("displaymath", mathEnvWith id Nothing "displaymath") - , ("math", math <$> mathEnv "math") - , ("equation", mathEnvWith id Nothing "equation") - , ("equation*", mathEnvWith id Nothing "equation*") - , ("gather", mathEnvWith id (Just "gathered") "gather") - , ("gather*", mathEnvWith id (Just "gathered") "gather*") - , ("multline", mathEnvWith id (Just "gathered") "multline") - , ("multline*", mathEnvWith id (Just "gathered") "multline*") - , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray") - , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*") - , ("align", mathEnvWith id (Just "aligned") "align") - , ("align*", mathEnvWith id (Just "aligned") "align*") - , ("alignat", mathEnvWith id (Just "aligned") "alignat") - , ("alignat*", mathEnvWith id (Just "aligned") "alignat*") - , ("dmath", mathEnvWith id Nothing "dmath") - , ("dmath*", mathEnvWith id Nothing "dmath*") - , ("dgroup", mathEnvWith id (Just "aligned") "dgroup") - , ("dgroup*", mathEnvWith id (Just "aligned") "dgroup*") - , ("darray", mathEnvWith id (Just "aligned") "darray") - , ("darray*", mathEnvWith id (Just "aligned") "darray*") - ] - inlineCommands :: PandocMonad m => M.Map Text (LP m Inlines) -inlineCommands = M.union inlineLanguageCommands $ M.fromList - [ ("emph", extractSpaces emph <$> tok) - , ("textit", extractSpaces emph <$> tok) - , ("textsl", extractSpaces emph <$> tok) - , ("textsc", extractSpaces smallcaps <$> tok) - , ("textsf", extractSpaces (spanWith ("",["sans-serif"],[])) <$> tok) - , ("textmd", extractSpaces (spanWith ("",["medium"],[])) <$> tok) - , ("textrm", extractSpaces (spanWith ("",["roman"],[])) <$> tok) - , ("textup", extractSpaces (spanWith ("",["upright"],[])) <$> tok) - , ("texttt", ttfamily) - , ("sout", extractSpaces strikeout <$> tok) - , ("alert", skipopts >> spanWith ("",["alert"],[]) <$> tok) -- beamer - , ("lq", return (str "‘")) - , ("rq", return (str "’")) - , ("textquoteleft", return (str "‘")) - , ("textquoteright", return (str "’")) - , ("textquotedblleft", return (str "“")) - , ("textquotedblright", return (str "”")) - , ("textsuperscript", extractSpaces superscript <$> tok) - , ("textsubscript", extractSpaces subscript <$> tok) - , ("textbackslash", lit "\\") - , ("backslash", lit "\\") - , ("slash", lit "/") - , ("textbf", extractSpaces strong <$> tok) - , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) - , ("underline", underline <$> tok) - , ("ldots", lit "…") - , ("vdots", lit "\8942") - , ("dots", lit "…") - , ("mdots", lit "…") - , ("sim", lit "~") - , ("sep", lit ",") - , ("label", rawInlineOr "label" dolabel) - , ("ref", rawInlineOr "ref" $ doref "ref") - , ("cref", rawInlineOr "cref" $ doref "ref") -- from cleveref.sty - , ("vref", rawInlineOr "vref" $ doref "ref+page") -- from varioref.sty - , ("eqref", rawInlineOr "eqref" $ doref "eqref") -- from amsmath.sty - , ("mbox", rawInlineOr "mbox" $ processHBox <$> tok) - , ("hbox", rawInlineOr "hbox" $ processHBox <$> tok) - , ("lettrine", rawInlineOr "lettrine" lettrine) - , ("(", mathInline . untokenize <$> manyTill anyTok (controlSeq ")")) - , ("[", mathDisplay . untokenize <$> manyTill anyTok (controlSeq "]")) - , ("ensuremath", mathInline . untokenize <$> braced) - , ("texorpdfstring", const <$> tok <*> tok) - , ("P", lit "¶") - , ("S", lit "§") - , ("$", lit "$") - , ("%", lit "%") - , ("&", lit "&") - , ("#", lit "#") - , ("_", lit "_") - , ("{", lit "{") - , ("}", lit "}") - , ("qed", lit "\a0\x25FB") - -- old TeX commands - , ("em", extractSpaces emph <$> inlines) - , ("it", extractSpaces emph <$> inlines) - , ("sl", extractSpaces emph <$> inlines) - , ("bf", extractSpaces strong <$> inlines) - , ("tt", code . stringify . toList <$> inlines) - , ("rm", inlines) - , ("itshape", extractSpaces emph <$> inlines) - , ("slshape", extractSpaces emph <$> inlines) - , ("scshape", extractSpaces smallcaps <$> inlines) - , ("bfseries", extractSpaces strong <$> inlines) - , ("MakeUppercase", makeUppercase <$> tok) - , ("MakeTextUppercase", makeUppercase <$> tok) -- textcase - , ("uppercase", makeUppercase <$> tok) - , ("MakeLowercase", makeLowercase <$> tok) - , ("MakeTextLowercase", makeLowercase <$> tok) - , ("lowercase", makeLowercase <$> tok) - , ("/", pure mempty) -- italic correction - , ("aa", lit "å") - , ("AA", lit "Å") - , ("ss", lit "ß") - , ("o", lit "ø") - , ("O", lit "Ø") - , ("L", lit "Ł") - , ("l", lit "ł") - , ("ae", lit "æ") - , ("AE", lit "Æ") - , ("oe", lit "œ") - , ("OE", lit "Œ") - , ("pounds", lit "£") - , ("euro", lit "€") - , ("copyright", lit "©") - , ("textasciicircum", lit "^") - , ("textasciitilde", lit "~") - , ("H", accent '\779' Nothing) -- hungarumlaut - , ("`", accent '\768' (Just '`')) -- grave - , ("'", accent '\769' (Just '\'')) -- acute - , ("^", accent '\770' (Just '^')) -- circ - , ("~", accent '\771' (Just '~')) -- tilde - , ("\"", accent '\776' Nothing) -- umlaut - , (".", accent '\775' Nothing) -- dot - , ("=", accent '\772' Nothing) -- macron - , ("|", accent '\781' Nothing) -- vertical line above - , ("b", accent '\817' Nothing) -- macron below - , ("c", accent '\807' Nothing) -- cedilla - , ("G", accent '\783' Nothing) -- doublegrave - , ("h", accent '\777' Nothing) -- hookabove - , ("d", accent '\803' Nothing) -- dotbelow - , ("f", accent '\785' Nothing) -- inverted breve - , ("r", accent '\778' Nothing) -- ringabove - , ("t", accent '\865' Nothing) -- double inverted breve - , ("U", accent '\782' Nothing) -- double vertical line above - , ("v", accent '\780' Nothing) -- hacek - , ("u", accent '\774' Nothing) -- breve - , ("k", accent '\808' Nothing) -- ogonek - , ("textogonekcentered", accent '\808' Nothing) -- ogonek - , ("i", lit "ı") -- dotless i - , ("j", lit "ȷ") -- dotless j - , ("newtie", accent '\785' Nothing) -- inverted breve - , ("textcircled", accent '\8413' Nothing) -- combining circle - , ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState - guard $ not inTableCell - optional opt - spaces)) - , (",", lit "\8198") - , ("@", pure mempty) - , (" ", lit "\160") - , ("ps", pure $ str "PS." <> space) - , ("TeX", lit "TeX") - , ("LaTeX", lit "LaTeX") - , ("bar", lit "|") - , ("textless", lit "<") - , ("textgreater", lit ">") - , ("thanks", skipopts >> note <$> grouped block) - , ("footnote", skipopts >> note <$> grouped block) - , ("passthrough", tok) -- \passthrough macro used by latex writer - -- for listings - , ("verb", doverb) - , ("lstinline", dolstinline) - , ("mintinline", domintinline) - , ("Verb", doverb) - , ("url", (\url -> link url "" (str url)) . unescapeURL . untokenize <$> - bracedUrl) - , ("nolinkurl", code . unescapeURL . untokenize <$> bracedUrl) - , ("href", do url <- bracedUrl - sp - link (unescapeURL $ untokenize url) "" <$> tok) - , ("includegraphics", do options <- option [] keyvals - src <- braced - mkImage options . unescapeURL . removeDoubleQuotes $ - untokenize src) - , ("enquote*", enquote True Nothing) - , ("enquote", enquote False Nothing) - -- foreignquote is supposed to use native quote marks - , ("foreignquote*", braced >>= enquote True . Just . untokenize) - , ("foreignquote", braced >>= enquote False . Just . untokenize) - -- hypehnquote uses regular quotes - , ("hyphenquote*", braced >>= enquote True . Just . untokenize) - , ("hyphenquote", braced >>= enquote False . Just . untokenize) - , ("figurename", doTerm Translations.Figure) - , ("prefacename", doTerm Translations.Preface) - , ("refname", doTerm Translations.References) - , ("bibname", doTerm Translations.Bibliography) - , ("chaptername", doTerm Translations.Chapter) - , ("partname", doTerm Translations.Part) - , ("contentsname", doTerm Translations.Contents) - , ("listfigurename", doTerm Translations.ListOfFigures) - , ("listtablename", doTerm Translations.ListOfTables) - , ("indexname", doTerm Translations.Index) - , ("abstractname", doTerm Translations.Abstract) - , ("tablename", doTerm Translations.Table) - , ("enclname", doTerm Translations.Encl) - , ("ccname", doTerm Translations.Cc) - , ("headtoname", doTerm Translations.To) - , ("pagename", doTerm Translations.Page) - , ("seename", doTerm Translations.See) - , ("seealsoname", doTerm Translations.SeeAlso) - , ("proofname", doTerm Translations.Proof) - , ("glossaryname", doTerm Translations.Glossary) - , ("lstlistingname", doTerm Translations.Listing) - , ("cite", citation "cite" NormalCitation False) - , ("Cite", citation "Cite" NormalCitation False) - , ("citep", citation "citep" NormalCitation False) - , ("citep*", citation "citep*" NormalCitation False) - , ("citeal", citation "citeal" NormalCitation False) - , ("citealp", citation "citealp" NormalCitation False) - , ("citealp*", citation "citealp*" NormalCitation False) - , ("autocite", citation "autocite" NormalCitation False) - , ("smartcite", citation "smartcite" NormalCitation False) - , ("footcite", inNote <$> citation "footcite" NormalCitation False) - , ("parencite", citation "parencite" NormalCitation False) - , ("supercite", citation "supercite" NormalCitation False) - , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) - , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) - , ("citeyear", citation "citeyear" SuppressAuthor False) - , ("autocite*", citation "autocite*" SuppressAuthor False) - , ("cite*", citation "cite*" SuppressAuthor False) - , ("parencite*", citation "parencite*" SuppressAuthor False) - , ("textcite", citation "textcite" AuthorInText False) - , ("citet", citation "citet" AuthorInText False) - , ("citet*", citation "citet*" AuthorInText False) - , ("citealt", citation "citealt" AuthorInText False) - , ("citealt*", citation "citealt*" AuthorInText False) - , ("textcites", citation "textcites" AuthorInText True) - , ("cites", citation "cites" NormalCitation True) - , ("autocites", citation "autocites" NormalCitation True) - , ("footcites", inNote <$> citation "footcites" NormalCitation True) - , ("parencites", citation "parencites" NormalCitation True) - , ("supercites", citation "supercites" NormalCitation True) - , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) - , ("Autocite", citation "Autocite" NormalCitation False) - , ("Smartcite", citation "Smartcite" NormalCitation False) - , ("Footcite", inNote <$> citation "Footcite" NormalCitation False) - , ("Parencite", citation "Parencite" NormalCitation False) - , ("Supercite", citation "Supercite" NormalCitation False) - , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) - , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) - , ("Citeyear", citation "Citeyear" SuppressAuthor False) - , ("Autocite*", citation "Autocite*" SuppressAuthor False) - , ("Cite*", citation "Cite*" SuppressAuthor False) - , ("Parencite*", citation "Parencite*" SuppressAuthor False) - , ("Textcite", citation "Textcite" AuthorInText False) - , ("Textcites", citation "Textcites" AuthorInText True) - , ("Cites", citation "Cites" NormalCitation True) - , ("Autocites", citation "Autocites" NormalCitation True) - , ("Footcites", inNote <$> citation "Footcites" NormalCitation True) - , ("Parencites", citation "Parencites" NormalCitation True) - , ("Supercites", citation "Supercites" NormalCitation True) - , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) - , ("citetext", complexNatbibCitation NormalCitation) - , ("citeauthor", (try (tok *> sp *> controlSeq "citetext") *> - complexNatbibCitation AuthorInText) - <|> citation "citeauthor" AuthorInText False) - , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= - addMeta "nocite")) - , ("hyperlink", hyperlink) - , ("hypertarget", hypertargetInline) - -- glossaries package - , ("gls", doAcronym "short") - , ("Gls", doAcronym "short") - , ("glsdesc", doAcronym "long") - , ("Glsdesc", doAcronym "long") - , ("GLSdesc", doAcronym "long") - , ("acrlong", doAcronym "long") - , ("Acrlong", doAcronym "long") - , ("acrfull", doAcronym "full") - , ("Acrfull", doAcronym "full") - , ("acrshort", doAcronym "abbrv") - , ("Acrshort", doAcronym "abbrv") - , ("glspl", doAcronymPlural "short") - , ("Glspl", doAcronymPlural "short") - , ("glsdescplural", doAcronymPlural "long") - , ("Glsdescplural", doAcronymPlural "long") - , ("GLSdescplural", doAcronymPlural "long") - -- acronyms package - , ("ac", doAcronym "short") - , ("acf", doAcronym "full") - , ("acs", doAcronym "abbrv") - , ("acl", doAcronym "long") - , ("acp", doAcronymPlural "short") - , ("acfp", doAcronymPlural "full") - , ("acsp", doAcronymPlural "abbrv") - , ("aclp", doAcronymPlural "long") - , ("Ac", doAcronym "short") - , ("Acf", doAcronym "full") - , ("Acs", doAcronym "abbrv") - , ("Acl", doAcronym "long") - , ("Acp", doAcronymPlural "short") - , ("Acfp", doAcronymPlural "full") - , ("Acsp", doAcronymPlural "abbrv") - , ("Aclp", doAcronymPlural "long") - -- siuntix - , ("si", skipopts *> dosi tok) - , ("SI", doSI tok) - , ("SIrange", doSIrange True tok) - , ("numrange", doSIrange False tok) - , ("numlist", doSInumlist) - , ("num", doSInum) - , ("ang", doSIang) - -- hyphenat - , ("bshyp", lit "\\\173") - , ("fshyp", lit "/\173") - , ("dothyp", lit ".\173") - , ("colonhyp", lit ":\173") - , ("hyp", lit "-") - , ("nohyphens", tok) - , ("textnhtt", ttfamily) - , ("nhttfamily", ttfamily) - -- LaTeX colors - , ("textcolor", coloredInline "color") - , ("colorbox", coloredInline "background-color") - -- fontawesome - , ("faCheck", lit "\10003") - , ("faClose", lit "\10007") - -- xspace - , ("xspace", doxspace) - -- etoolbox - , ("ifstrequal", ifstrequal) - , ("newtoggle", braced >>= newToggle) - , ("toggletrue", braced >>= setToggle True) - , ("togglefalse", braced >>= setToggle False) - , ("iftoggle", try $ ifToggle >> inline) - -- biblatex misc - , ("RN", romanNumeralUpper) - , ("Rn", romanNumeralLower) - -- babel - , ("foreignlanguage", foreignlanguage) - -- include - , ("input", rawInlineOr "input" $ include "input") - -- soul package - , ("ul", underline <$> tok) - -- ulem package - , ("uline", underline <$> tok) - -- plain tex stuff that should just be passed through as raw tex - , ("ifdim", ifdim) - -- stackengine - , ("addstackgap", skipopts *> tok) - ] - -accent :: PandocMonad m => Char -> Maybe Char -> LP m Inlines -accent combiningAccent fallBack = try $ do - ils <- tok - case toList ils of - (Str (T.uncons -> Just (x, xs)) : ys) -> return $ fromList $ - -- try to normalize to the combined character: - Str (Normalize.normalize Normalize.NFC - (T.pack [x, combiningAccent]) <> xs) : ys - [Space] -> return $ str $ T.singleton $ fromMaybe combiningAccent fallBack - [] -> return $ str $ T.singleton $ fromMaybe combiningAccent fallBack - _ -> return ils - +inlineCommands = M.unions + [ accentCommands tok + , citationCommands inline + , siunitxCommands tok + , acronymCommands + , refCommands + , nameCommands + , verbCommands + , charCommands + , enquoteCommands tok + , inlineLanguageCommands tok + , biblatexInlineCommands tok + , rest ] + where + rest = M.fromList + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("textsf", extractSpaces (spanWith ("",["sans-serif"],[])) <$> tok) + , ("textmd", extractSpaces (spanWith ("",["medium"],[])) <$> tok) + , ("textrm", extractSpaces (spanWith ("",["roman"],[])) <$> tok) + , ("textup", extractSpaces (spanWith ("",["upright"],[])) <$> tok) + , ("texttt", ttfamily) + , ("sout", extractSpaces strikeout <$> tok) + , ("alert", skipopts >> spanWith ("",["alert"],[]) <$> tok) -- beamer + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) + , ("underline", underline <$> tok) + , ("mbox", rawInlineOr "mbox" $ processHBox <$> tok) + , ("hbox", rawInlineOr "hbox" $ processHBox <$> tok) + , ("lettrine", rawInlineOr "lettrine" lettrine) + , ("(", mathInline . untokenize <$> manyTill anyTok (controlSeq ")")) + , ("[", mathDisplay . untokenize <$> manyTill anyTok (controlSeq "]")) + , ("ensuremath", mathInline . untokenize <$> braced) + , ("texorpdfstring", const <$> tok <*> tok) + -- old TeX commands + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) + , ("tt", code . stringify . toList <$> inlines) + , ("rm", inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) + , ("MakeUppercase", makeUppercase <$> tok) + , ("MakeTextUppercase", makeUppercase <$> tok) -- textcase + , ("uppercase", makeUppercase <$> tok) + , ("MakeLowercase", makeLowercase <$> tok) + , ("MakeTextLowercase", makeLowercase <$> tok) + , ("lowercase", makeLowercase <$> tok) + , ("thanks", skipopts >> note <$> grouped block) + , ("footnote", skipopts >> note <$> grouped block) + , ("passthrough", tok) -- \passthrough macro used by latex writer + -- for listings + , ("includegraphics", do options <- option [] keyvals + src <- braced + mkImage options . + unescapeURL . + removeDoubleQuotes $ untokenize src) + -- hyperref + , ("url", (\url -> link url "" (str url)) . unescapeURL . untokenize <$> + bracedUrl) + , ("nolinkurl", code . unescapeURL . untokenize <$> bracedUrl) + , ("href", do url <- bracedUrl + sp + link (unescapeURL $ untokenize url) "" <$> tok) + , ("hyperlink", hyperlink) + , ("hyperref", hyperref) + , ("hypertarget", hypertargetInline) + -- hyphenat + , ("nohyphens", tok) + , ("textnhtt", ttfamily) + , ("nhttfamily", ttfamily) + -- LaTeX colors + , ("textcolor", coloredInline "color") + , ("colorbox", coloredInline "background-color") + -- etoolbox + , ("ifstrequal", ifstrequal) + , ("newtoggle", braced >>= newToggle) + , ("toggletrue", braced >>= setToggle True) + , ("togglefalse", braced >>= setToggle False) + , ("iftoggle", try $ ifToggle >> inline) + -- include + , ("input", rawInlineOr "input" $ include "input") + -- soul package + , ("ul", underline <$> tok) + -- ulem package + , ("uline", underline <$> tok) + -- plain tex stuff that should just be passed through as raw tex + , ("ifdim", ifdim) + -- stackengine + , ("addstackgap", skipopts *> tok) + ] lettrine :: PandocMonad m => LP m Inlines lettrine = do - optional opt + optional rawopt x <- tok y <- tok return $ extractSpaces (spanWith ("",["lettrine"],[])) x <> smallcaps y @@ -979,32 +446,18 @@ alterStr :: (Text -> Text) -> Inline -> Inline alterStr f (Str xs) = Str (f xs) alterStr _ x = x -foreignlanguage :: PandocMonad m => LP m Inlines -foreignlanguage = do - babelLang <- untokenize <$> braced - case babelLangToBCP47 babelLang of - Just lang -> spanWith ("", [], [("lang", renderLang lang)]) <$> tok - _ -> tok - -inlineLanguageCommands :: PandocMonad m => M.Map Text (LP m Inlines) -inlineLanguageCommands = M.fromList $ mk <$> M.toList polyglossiaLangToBCP47 - where - mk (polyglossia, bcp47Func) = - ("text" <> polyglossia, inlineLanguage bcp47Func) - -inlineLanguage :: PandocMonad m => (Text -> Lang) -> LP m Inlines -inlineLanguage bcp47Func = do - o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') - <$> rawopt - let lang = renderLang $ bcp47Func o - extractSpaces (spanWith ("", [], [("lang", lang)])) <$> tok - hyperlink :: PandocMonad m => LP m Inlines hyperlink = try $ do src <- untokenize <$> braced lab <- tok return $ link ("#" <> src) "" lab +hyperref :: PandocMonad m => LP m Inlines +hyperref = try $ do + url <- (("#" <>) . untokenize <$> try (sp *> bracketedToks <* sp)) + <|> untokenize <$> (bracedUrl <* bracedUrl <* bracedUrl) + link url "" <$> tok + hypertargetBlock :: PandocMonad m => LP m Blocks hypertargetBlock = try $ do ref <- untokenize <$> braced @@ -1019,31 +472,6 @@ hypertargetInline = try $ do ils <- grouped inline return $ spanWith (ref, [], []) ils -romanNumeralUpper :: (PandocMonad m) => LP m Inlines -romanNumeralUpper = - str . toRomanNumeral <$> romanNumeralArg - -romanNumeralLower :: (PandocMonad m) => LP m Inlines -romanNumeralLower = - str . T.toLower . toRomanNumeral <$> romanNumeralArg - -romanNumeralArg :: (PandocMonad m) => LP m Int -romanNumeralArg = spaces *> (parser <|> inBraces) - where - inBraces = do - symbol '{' - spaces - res <- parser - spaces - symbol '}' - return res - parser = do - Tok _ Word s <- satisfyTok isWordTok - let (digits, rest) = T.span isDigit s - unless (T.null rest) $ - Prelude.fail "Non-digits in argument to \\Rn or \\RN" - safeRead digits - newToggle :: (Monoid a, PandocMonad m) => [Tok] -> LP m a newToggle name = do updateState $ \st -> @@ -1074,9 +502,6 @@ ifToggle = do report $ UndefinedToggle name' pos return () -doTerm :: PandocMonad m => Translations.Term -> LP m Inlines -doTerm term = str <$> translateTerm term - ifstrequal :: (PandocMonad m, Monoid a) => LP m a ifstrequal = do str1 <- tok @@ -1097,13 +522,6 @@ coloredInline stylename = do ttfamily :: PandocMonad m => LP m Inlines ttfamily = code . stringify . toList <$> tok -rawInlineOr :: PandocMonad m => Text -> LP m Inlines -> LP m Inlines -rawInlineOr name' fallback = do - parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions - if parseRaw - then rawInline "latex" <$> getRawCommand name' ("\\" <> name') - else fallback - processHBox :: Inlines -> Inlines processHBox = walk convert where @@ -1154,79 +572,90 @@ treatAsInline = Set.fromList , "pagebreak" ] -label :: PandocMonad m => LP m () -label = do - controlSeq "label" - t <- braced - updateState $ \st -> st{ sLastLabel = Just $ untokenize t } - -dolabel :: PandocMonad m => LP m Inlines -dolabel = do - v <- braced - let refstr = untokenize v - updateState $ \st -> - st{ sLastLabel = Just refstr } - return $ spanWith (refstr,[],[("label", refstr)]) - $ inBrackets $ str $ untokenize v - -doref :: PandocMonad m => Text -> LP m Inlines -doref cls = do - v <- braced - let refstr = untokenize v - return $ linkWith ("",[],[ ("reference-type", cls) - , ("reference", refstr)]) - ("#" <> refstr) - "" - (inBrackets $ str refstr) - lookupListDefault :: (Ord k) => v -> [k] -> M.Map k v -> v lookupListDefault d = (fromMaybe d .) . lookupList where lookupList l m = msum $ map (`M.lookup` m) l inline :: PandocMonad m => LP m Inlines -inline = (mempty <$ comment) - <|> (space <$ whitespace) - <|> (softbreak <$ endline) - <|> word - <|> macroDef (rawInline "latex") - <|> inlineCommand' - <|> inlineEnvironment - <|> inlineGroup - <|> (symbol '-' *> - option (str "-") (symbol '-' *> - option (str "–") (str "—" <$ symbol '-'))) - <|> doubleQuote - <|> singleQuote - <|> (str "”" <$ try (symbol '\'' >> symbol '\'')) - <|> (str "”" <$ symbol '”') - <|> (str "’" <$ symbol '\'') - <|> (str "’" <$ symbol '’') - <|> (str "\160" <$ symbol '~') - <|> dollarsMath - <|> (guardEnabled Ext_literate_haskell *> symbol '|' *> doLHSverb) - <|> (str . T.singleton <$> primEscape) - <|> regularSymbol - <|> (do res <- symbolIn "#^'`\"[]&" - pos <- getPosition - let s = untoken res - report $ ParsingUnescaped s pos - return $ str s) +inline = do + Tok pos toktype t <- lookAhead anyTok + let symbolAsString = str . untoken <$> anySymbol + let unescapedSymbolAsString = + do s <- untoken <$> anySymbol + report $ ParsingUnescaped s pos + return $ str s + case toktype of + Comment -> mempty <$ comment + Spaces -> space <$ whitespace + Newline -> softbreak <$ endline + Word -> word + Esc1 -> str . T.singleton <$> primEscape + Esc2 -> str . T.singleton <$> primEscape + Symbol -> + case t of + "-" -> symbol '-' *> + option (str "-") (symbol '-' *> + option (str "–") (str "—" <$ symbol '-')) + "'" -> symbol '\'' *> + option (str "’") (str "”" <$ symbol '\'') + "~" -> str "\160" <$ symbol '~' + "`" -> doubleQuote <|> singleQuote <|> symbolAsString + "\"" -> doubleQuote <|> singleQuote <|> symbolAsString + "“" -> doubleQuote <|> symbolAsString + "‘" -> singleQuote <|> symbolAsString + "$" -> dollarsMath <|> unescapedSymbolAsString + "|" -> (guardEnabled Ext_literate_haskell *> + symbol '|' *> doLHSverb) <|> symbolAsString + "{" -> inlineGroup + "#" -> unescapedSymbolAsString + "&" -> unescapedSymbolAsString + "_" -> unescapedSymbolAsString + "^" -> unescapedSymbolAsString + "\\" -> mzero + "}" -> mzero + _ -> symbolAsString + CtrlSeq _ -> macroDef (rawInline "latex") + <|> inlineCommand' + <|> inlineEnvironment + <|> inlineGroup + _ -> mzero inlines :: PandocMonad m => LP m Inlines inlines = mconcat <$> many inline +opt :: PandocMonad m => LP m Inlines +opt = do + toks <- try (sp *> bracketedToks <* sp) + -- now parse the toks as inlines + st <- getState + parsed <- runParserT (mconcat <$> many inline) st "bracketed option" toks + case parsed of + Right result -> return result + Left e -> throwError $ PandocParsecError (toSources toks) e + -- block elements: preamble :: PandocMonad m => LP m Blocks preamble = mconcat <$> many preambleBlock where preambleBlock = (mempty <$ spaces1) <|> macroDef (rawBlock "latex") + <|> filecontents <|> (mempty <$ blockCommand) <|> (mempty <$ braced) <|> (do notFollowedBy (begin_ "document") anyTok return mempty) +rule :: PandocMonad m => LP m Blocks +rule = do + skipopts + width <- T.takeWhile (\c -> isDigit c || c == '.') . stringify <$> tok + _thickness <- tok + -- 0-width rules are used to fix spacing issues: + case safeRead width of + Just (0 :: Double) -> return mempty + _ -> return horizontalRule + paragraph :: PandocMonad m => LP m Blocks paragraph = do x <- trimInlines . mconcat <$> many1 inline @@ -1264,6 +693,16 @@ include name = do mapM_ (insertIncluded defaultExt) fs return mempty +readFileFromTexinputs :: PandocMonad m => FilePath -> LP m (Maybe Text) +readFileFromTexinputs fp = do + fileContentsMap <- sFileContents <$> getState + case M.lookup (T.pack fp) fileContentsMap of + Just t -> return (Just t) + Nothing -> do + dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." + <$> lookupEnv "TEXINPUTS" + readFileFromDirs dirs fp + insertIncluded :: PandocMonad m => FilePath -> FilePath @@ -1273,13 +712,12 @@ insertIncluded defaultExtension f' = do ".tex" -> f' ".sty" -> f' _ -> addExtension f' defaultExtension - dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." <$> lookupEnv "TEXINPUTS" pos <- getPosition containers <- getIncludeFiles <$> getState when (T.pack f `elem` containers) $ throwError $ PandocParseError $ T.pack $ "Include file loop at " ++ show pos updateState $ addIncludeFile $ T.pack f - mbcontents <- readFileFromDirs dirs f + mbcontents <- readFileFromTexinputs f contents <- case mbcontents of Just s -> return s Nothing -> do @@ -1288,10 +726,6 @@ insertIncluded defaultExtension f' = do getInput >>= setInput . (tokenize f contents ++) updateState dropLatestIncludeFile -addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> LP m () -addMeta field val = updateState $ \st -> - st{ sMeta = addMetaField field val $ sMeta st } - authors :: PandocMonad m => LP m () authors = try $ do bgroup @@ -1300,150 +734,6 @@ authors = try $ do egroup addMeta "author" (map trimInlines auths) -macroDef :: (PandocMonad m, Monoid a) => (Text -> a) -> LP m a -macroDef constructor = do - (_, s) <- withRaw (commandDef <|> environmentDef) - (constructor (untokenize s) <$ - guardDisabled Ext_latex_macros) - <|> return mempty - where commandDef = do - (name, macro') <- newcommand <|> letmacro <|> defmacro - guardDisabled Ext_latex_macros <|> - updateState (\s -> s{ sMacros = M.insert name macro' (sMacros s) }) - environmentDef = do - mbenv <- newenvironment - case mbenv of - Nothing -> return () - Just (name, macro1, macro2) -> - guardDisabled Ext_latex_macros <|> - do updateState $ \s -> s{ sMacros = - M.insert name macro1 (sMacros s) } - updateState $ \s -> s{ sMacros = - M.insert ("end" <> name) macro2 (sMacros s) } - -- @\newenvironment{envname}[n-args][default]{begin}{end}@ - -- is equivalent to - -- @\newcommand{\envname}[n-args][default]{begin}@ - -- @\newcommand{\endenvname}@ - -letmacro :: PandocMonad m => LP m (Text, Macro) -letmacro = do - controlSeq "let" - (name, contents) <- withVerbatimMode $ do - Tok _ (CtrlSeq name) _ <- anyControlSeq - optional $ symbol '=' - spaces - -- we first parse in verbatim mode, and then expand macros, - -- because we don't want \let\foo\bar to turn into - -- \let\foo hello if we have previously \def\bar{hello} - contents <- bracedOrToken - return (name, contents) - contents' <- doMacros' 0 contents - return (name, Macro ExpandWhenDefined [] Nothing contents') - -defmacro :: PandocMonad m => LP m (Text, Macro) -defmacro = try $ - -- we use withVerbatimMode, because macros are to be expanded - -- at point of use, not point of definition - withVerbatimMode $ do - controlSeq "def" - Tok _ (CtrlSeq name) _ <- anyControlSeq - argspecs <- many (argspecArg <|> argspecPattern) - contents <- bracedOrToken - return (name, Macro ExpandWhenUsed argspecs Nothing contents) - -argspecArg :: PandocMonad m => LP m ArgSpec -argspecArg = do - Tok _ (Arg i) _ <- satisfyTok isArgTok - return $ ArgNum i - -argspecPattern :: PandocMonad m => LP m ArgSpec -argspecPattern = - Pattern <$> many1 (satisfyTok (\(Tok _ toktype' txt) -> - (toktype' == Symbol || toktype' == Word) && - (txt /= "{" && txt /= "\\" && txt /= "}"))) - -newcommand :: PandocMonad m => LP m (Text, Macro) -newcommand = do - pos <- getPosition - Tok _ (CtrlSeq mtype) _ <- controlSeq "newcommand" <|> - controlSeq "renewcommand" <|> - controlSeq "providecommand" <|> - controlSeq "DeclareMathOperator" <|> - controlSeq "DeclareRobustCommand" - withVerbatimMode $ do - Tok _ (CtrlSeq name) txt <- do - optional (symbol '*') - anyControlSeq <|> - (symbol '{' *> spaces *> anyControlSeq <* spaces <* symbol '}') - spaces - numargs <- option 0 $ try bracketedNum - let argspecs = map ArgNum [1..numargs] - spaces - optarg <- option Nothing $ Just <$> try bracketedToks - spaces - contents' <- bracedOrToken - let contents = - case mtype of - "DeclareMathOperator" -> - Tok pos (CtrlSeq "mathop") "\\mathop" - : Tok pos Symbol "{" - : Tok pos (CtrlSeq "mathrm") "\\mathrm" - : Tok pos Symbol "{" - : (contents' ++ - [ Tok pos Symbol "}", Tok pos Symbol "}" ]) - _ -> contents' - macros <- sMacros <$> getState - case M.lookup name macros of - Just macro - | mtype == "newcommand" -> do - report $ MacroAlreadyDefined txt pos - return (name, macro) - | mtype == "providecommand" -> return (name, macro) - _ -> return (name, Macro ExpandWhenUsed argspecs optarg contents) - -newenvironment :: PandocMonad m => LP m (Maybe (Text, Macro, Macro)) -newenvironment = do - pos <- getPosition - Tok _ (CtrlSeq mtype) _ <- controlSeq "newenvironment" <|> - controlSeq "renewenvironment" <|> - controlSeq "provideenvironment" - withVerbatimMode $ do - optional $ symbol '*' - spaces - name <- untokenize <$> braced - spaces - numargs <- option 0 $ try bracketedNum - spaces - optarg <- option Nothing $ Just <$> try bracketedToks - let argspecs = map (\i -> ArgNum i) [1..numargs] - startcontents <- spaces >> bracedOrToken - endcontents <- spaces >> bracedOrToken - macros <- sMacros <$> getState - case M.lookup name macros of - Just _ - | mtype == "newenvironment" -> do - report $ MacroAlreadyDefined name pos - return Nothing - | mtype == "provideenvironment" -> - return Nothing - _ -> return $ Just (name, - Macro ExpandWhenUsed argspecs optarg startcontents, - Macro ExpandWhenUsed [] Nothing endcontents) - -bracketedNum :: PandocMonad m => LP m Int -bracketedNum = do - ds <- untokenize <$> bracketedToks - case safeRead ds of - Just i -> return i - _ -> return 0 - -setCaption :: PandocMonad m => LP m () -setCaption = try $ do - skipopts - ils <- tok - optional $ try $ spaces *> label - updateState $ \st -> st{ sCaption = Just ils } - looseItem :: PandocMonad m => LP m Blocks looseItem = do inListItem <- sInListItem <$> getState @@ -1457,10 +747,6 @@ epigraph = do p2 <- grouped block return $ divWith ("", ["epigraph"], []) (p1 <> p2) -resetCaption :: PandocMonad m => LP m () -resetCaption = updateState $ \st -> st{ sCaption = Nothing - , sLastLabel = Nothing } - section :: PandocMonad m => Attr -> Int -> LP m Blocks section (ident, classes, kvs) lvl = do skipopts @@ -1554,7 +840,7 @@ blockCommands = M.fromList , ("address", mempty <$ (skipopts *> tok >>= addMeta "address")) , ("signature", mempty <$ (skipopts *> authors)) , ("date", mempty <$ (skipopts *> tok >>= addMeta "date")) - , ("newtheorem", newtheorem) + , ("newtheorem", newtheorem inline) , ("theoremstyle", theoremstyle) -- KOMA-Script metadata commands , ("extratitle", mempty <$ (skipopts *> tok >>= addMeta "extratitle")) @@ -1598,11 +884,11 @@ blockCommands = M.fromList -- , ("hrule", pure horizontalRule) , ("strut", pure mempty) - , ("rule", skipopts *> tok *> tok $> horizontalRule) + , ("rule", rule) , ("item", looseItem) , ("documentclass", skipopts *> braced *> preamble) , ("centerline", para . trimInlines <$> (skipopts *> tok)) - , ("caption", mempty <$ setCaption) + , ("caption", mempty <$ setCaption inline) , ("bibliography", mempty <$ (skipopts *> braced >>= addMeta "bibliography" . splitBibs . untokenize)) , ("addbibresource", mempty <$ (skipopts *> braced >>= @@ -1640,7 +926,8 @@ blockCommands = M.fromList environments :: PandocMonad m => M.Map Text (LP m Blocks) -environments = M.fromList +environments = M.union (tableEnvironments blocks inline) $ + M.fromList [ ("document", env "document" blocks <* skipMany anyTok) , ("abstract", mempty <$ (env "abstract" blocks >>= addMeta "abstract")) , ("sloppypar", env "sloppypar" blocks) @@ -1654,13 +941,6 @@ environments = M.fromList , ("flushright", divWith ("", ["flushright"], []) <$> env "flushright" blocks) , ("flushleft", divWith ("", ["flushleft"], []) <$> env "flushleft" blocks) , ("landscape", env "landscape" blocks) - , ("longtable", env "longtable" $ - resetCaption *> simpTable "longtable" False >>= addTableCaption) - , ("table", env "table" $ - skipopts *> resetCaption *> blocks >>= addTableCaption) - , ("tabular*", env "tabular*" $ simpTable "tabular*" True) - , ("tabularx", env "tabularx" $ simpTable "tabularx" True) - , ("tabular", env "tabular" $ simpTable "tabular" False) , ("quote", blockQuote <$> env "quote" blocks) , ("quotation", blockQuote <$> env "quotation" blocks) , ("verse", blockQuote <$> env "verse" blocks) @@ -1683,7 +963,7 @@ environments = M.fromList , ("lilypond", rawVerbEnv "lilypond") , ("ly", rawVerbEnv "ly") -- amsthm - , ("proof", proof) + , ("proof", proof blocks opt) -- etoolbox , ("ifstrequal", ifstrequal) , ("newtoggle", braced >>= newToggle) @@ -1692,130 +972,29 @@ environments = M.fromList , ("iftoggle", try $ ifToggle >> block) ] -theoremstyle :: PandocMonad m => LP m Blocks -theoremstyle = do - stylename <- untokenize <$> braced - let mbstyle = case stylename of - "plain" -> Just PlainStyle - "definition" -> Just DefinitionStyle - "remark" -> Just RemarkStyle - _ -> Nothing - case mbstyle of - Nothing -> return () - Just sty -> updateState $ \s -> s{ sLastTheoremStyle = sty } - return mempty - -newtheorem :: PandocMonad m => LP m Blocks -newtheorem = do - number <- option True (False <$ symbol '*' <* sp) +filecontents :: PandocMonad m => LP m Blocks +filecontents = try $ do + controlSeq "begin" name <- untokenize <$> braced - sp - series <- option Nothing $ Just . untokenize <$> bracketedToks - sp - showName <- tok - sp - syncTo <- option Nothing $ Just . untokenize <$> bracketedToks - sty <- sLastTheoremStyle <$> getState - let spec = TheoremSpec { theoremName = showName - , theoremStyle = sty - , theoremSeries = series - , theoremSyncTo = syncTo - , theoremNumber = number - , theoremLastNum = DottedNum [0] } - tmap <- sTheoremMap <$> getState - updateState $ \s -> s{ sTheoremMap = - M.insert name spec tmap } + guard $ name == "filecontents" || name == "filecontents*" + skipopts + fp <- untokenize <$> braced + txt <- verbEnv name + updateState $ \st -> + st{ sFileContents = M.insert fp txt (sFileContents st) } return mempty -proof :: PandocMonad m => LP m Blocks -proof = do - title <- option (B.text "Proof") opt - bs <- env "proof" blocks - return $ - B.divWith ("", ["proof"], []) $ - addQed $ addTitle (B.emph (title <> ".")) bs - -addTitle :: Inlines -> Blocks -> Blocks -addTitle ils bs = - case B.toList bs of - (Para xs : rest) - -> B.fromList (Para (B.toList ils ++ (Space : xs)) : rest) - _ -> B.para ils <> bs - -addQed :: Blocks -> Blocks -addQed bs = - case Seq.viewr (B.unMany bs) of - s Seq.:> Para ils - -> B.Many (s Seq.|> Para (ils ++ B.toList qedSign)) - _ -> bs <> B.para qedSign - where - qedSign = B.str "\xa0\x25FB" - environment :: PandocMonad m => LP m Blocks environment = try $ do controlSeq "begin" name <- untokenize <$> braced M.findWithDefault mzero name environments <|> - theoremEnvironment name <|> + theoremEnvironment blocks opt name <|> if M.member name (inlineEnvironments :: M.Map Text (LP PandocPure Inlines)) then mzero else try (rawEnv name) <|> rawVerbEnv name -theoremEnvironment :: PandocMonad m => Text -> LP m Blocks -theoremEnvironment name = do - tmap <- sTheoremMap <$> getState - case M.lookup name tmap of - Nothing -> mzero - Just tspec -> do - optTitle <- option mempty $ (\x -> space <> "(" <> x <> ")") <$> opt - mblabel <- option Nothing $ Just . untokenize <$> - try (spaces >> controlSeq "label" >> spaces >> braced) - bs <- env name blocks - number <- - if theoremNumber tspec - then do - let name' = fromMaybe name $ theoremSeries tspec - num <- getNextNumber - (maybe (DottedNum [0]) theoremLastNum . - M.lookup name' . sTheoremMap) - updateState $ \s -> - s{ sTheoremMap = - M.adjust - (\spec -> spec{ theoremLastNum = num }) - name' - (sTheoremMap s) - } - - case mblabel of - Just ident -> - updateState $ \s -> - s{ sLabels = M.insert ident - (B.toList $ - theoremName tspec <> "\160" <> - str (renderDottedNum num)) (sLabels s) } - Nothing -> return () - return $ space <> B.text (renderDottedNum num) - else return mempty - let titleEmph = case theoremStyle tspec of - PlainStyle -> B.strong - DefinitionStyle -> B.strong - RemarkStyle -> B.emph - let title = titleEmph (theoremName tspec <> number) - <> optTitle <> "." <> space - return $ divWith (fromMaybe "" mblabel, [name], []) $ addTitle title - $ case theoremStyle tspec of - PlainStyle -> walk italicize bs - _ -> bs - -italicize :: Block -> Block -italicize (Para ils) = Para [Emph ils] -italicize (Plain ils) = Plain [Emph ils] -italicize x = x - -env :: PandocMonad m => Text -> LP m a -> LP m a -env name p = p <* end_ name - rawEnv :: PandocMonad m => Text -> LP m Blocks rawEnv name = do exts <- getOption readerExtensions @@ -1823,15 +1002,17 @@ rawEnv name = do rawOptions <- mconcat <$> many rawopt let beginCommand = "\\begin{" <> name <> "}" <> rawOptions pos1 <- getPosition - (bs, raw) <- withRaw $ env name blocks if parseRaw - then return $ rawBlock "latex" + then do + (_, raw) <- withRaw $ env name blocks + return $ rawBlock "latex" $ beginCommand <> untokenize raw else do + bs <- env name blocks report $ SkippedContent beginCommand pos1 pos2 <- getPosition report $ SkippedContent ("\\end{" <> name <> "}") pos2 - return bs + return $ divWith ("",[name],[]) bs rawVerbEnv :: PandocMonad m => Text -> LP m Blocks rawVerbEnv name = do @@ -1890,8 +1071,7 @@ inputMinted = do pos <- getPosition attr <- mintedAttr f <- T.filter (/='"') . untokenize <$> braced - dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." <$> lookupEnv "TEXINPUTS" - mbCode <- readFileFromDirs dirs (T.unpack f) + mbCode <- readFileFromTexinputs (T.unpack f) rawcode <- case mbCode of Just s -> return s Nothing -> do @@ -1989,8 +1169,7 @@ inputListing = do pos <- getPosition options <- option [] keyvals f <- T.filter (/='"') . untokenize <$> braced - dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." <$> lookupEnv "TEXINPUTS" - mbCode <- readFileFromDirs dirs (T.unpack f) + mbCode <- readFileFromTexinputs (T.unpack f) codeLines <- case mbCode of Just s -> return $ T.lines s Nothing -> do @@ -1999,7 +1178,8 @@ inputListing = do let (ident,classes,kvs) = parseListingsOptions options let classes' = (case listingsLanguage options of - Nothing -> (take 1 (languagesByExtension (T.pack $ takeExtension $ T.unpack f)) <>) + Nothing -> (take 1 (languagesByExtension defaultSyntaxMap + (T.pack $ takeExtension $ T.unpack f)) <>) Just _ -> id) classes let firstline = fromMaybe 1 $ lookup "firstline" options >>= safeRead let lastline = fromMaybe (length codeLines) $ @@ -2065,358 +1245,23 @@ orderedList' = try $ do bs <- listenv "enumerate" (many item) return $ orderedListWith (start, style, delim) bs --- tables - -hline :: PandocMonad m => LP m () -hline = try $ do - spaces - controlSeq "hline" <|> - -- booktabs rules: - controlSeq "toprule" <|> - controlSeq "bottomrule" <|> - controlSeq "midrule" <|> - controlSeq "endhead" <|> - controlSeq "endfirsthead" - spaces - optional opt - return () - -lbreak :: PandocMonad m => LP m Tok -lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline") - <* skipopts <* spaces - -amp :: PandocMonad m => LP m Tok -amp = symbol '&' - --- Split a Word into individual Symbols (for parseAligns) -splitWordTok :: PandocMonad m => LP m () -splitWordTok = do - inp <- getInput - case inp of - (Tok spos Word t : rest) -> - setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest - _ -> return () - -parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))] -parseAligns = try $ do - let maybeBar = skipMany - (try $ sp *> (() <$ symbol '|' <|> () <$ (symbol '@' >> braced))) - let cAlign = AlignCenter <$ symbol 'c' - let lAlign = AlignLeft <$ symbol 'l' - let rAlign = AlignRight <$ symbol 'r' - let parAlign = AlignLeft <$ symbol 'p' - -- aligns from tabularx - let xAlign = AlignLeft <$ symbol 'X' - let mAlign = AlignLeft <$ symbol 'm' - let bAlign = AlignLeft <$ symbol 'b' - let alignChar = splitWordTok *> ( cAlign <|> lAlign <|> rAlign <|> parAlign - <|> xAlign <|> mAlign <|> bAlign ) - let alignPrefix = symbol '>' >> braced - let alignSuffix = symbol '<' >> braced - let colWidth = try $ do - symbol '{' - ds <- trim . untokenize <$> manyTill anyTok (controlSeq "linewidth") - spaces - symbol '}' - return $ safeRead ds - let alignSpec = do - pref <- option [] alignPrefix - spaces - al <- alignChar - width <- colWidth <|> option Nothing (do s <- untokenize <$> braced - pos <- getPosition - report $ SkippedContent s pos - return Nothing) - spaces - suff <- option [] alignSuffix - return (al, width, (pref, suff)) - let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro - symbol '*' - spaces - ds <- trim . untokenize <$> braced - spaces - spec <- braced - case safeRead ds of - Just n -> - getInput >>= setInput . (mconcat (replicate n spec) ++) - Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number" - bgroup - spaces - maybeBar - aligns' <- many $ try $ spaces >> optional starAlign >> - (alignSpec <* maybeBar) - spaces - egroup - spaces - return $ map toSpec aligns' - where - toColWidth (Just w) | w > 0 = ColWidth w - toColWidth _ = ColWidthDefault - toSpec (x, y, z) = (x, toColWidth y, z) - --- N.B. this parser returns a Row that may have erroneous empty cells --- in it. See the note above fixTableHead for details. -parseTableRow :: PandocMonad m - => Text -- ^ table environment name - -> [([Tok], [Tok])] -- ^ pref/suffixes - -> LP m Row -parseTableRow envname prefsufs = do - notFollowedBy (spaces *> end_ envname) - -- add prefixes and suffixes in token stream: - let celltoks (pref, suff) = do - prefpos <- getPosition - contents <- mconcat <$> - many ( snd <$> withRaw (controlSeq "parbox" >> parbox) -- #5711 - <|> - snd <$> withRaw (inlineEnvironment <|> dollarsMath) - <|> - (do notFollowedBy - (() <$ amp <|> () <$ lbreak <|> end_ envname) - count 1 anyTok) ) - - suffpos <- getPosition - option [] (count 1 amp) - return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff - rawcells <- mapM celltoks prefsufs - oldInput <- getInput - cells <- mapM (\ts -> setInput ts >> parseTableCell) rawcells - setInput oldInput - spaces - return $ Row nullAttr cells - -parseTableCell :: PandocMonad m => LP m Cell -parseTableCell = do - spaces - updateState $ \st -> st{ sInTableCell = True } - cell' <- multicolumnCell - <|> multirowCell - <|> parseSimpleCell - <|> parseEmptyCell - updateState $ \st -> st{ sInTableCell = False } - spaces - return cell' - where - -- The parsing of empty cells is important in LaTeX, especially when dealing - -- with multirow/multicolumn. See #6603. - parseEmptyCell = spaces $> emptyCell - -cellAlignment :: PandocMonad m => LP m Alignment -cellAlignment = skipMany (symbol '|') *> alignment <* skipMany (symbol '|') - where - alignment = do - c <- untoken <$> singleChar - return $ case c of - "l" -> AlignLeft - "r" -> AlignRight - "c" -> AlignCenter - "*" -> AlignDefault - _ -> AlignDefault - -plainify :: Blocks -> Blocks -plainify bs = case toList bs of - [Para ils] -> plain (fromList ils) - _ -> bs - -multirowCell :: PandocMonad m => LP m Cell -multirowCell = controlSeq "multirow" >> do - -- Full prototype for \multirow macro is: - -- \multirow[vpos]{nrows}[bigstruts]{width}[vmove]{text} - -- However, everything except `nrows` and `text` make - -- sense in the context of the Pandoc AST - _ <- optional $ symbol '[' *> cellAlignment <* symbol ']' -- vertical position - nrows <- fmap (fromMaybe 1 . safeRead . untokenize) braced - _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- bigstrut-related - _ <- symbol '{' *> manyTill anyTok (symbol '}') -- Cell width - _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- Length used for fine-tuning - content <- symbol '{' *> (plainify <$> blocks) <* symbol '}' - return $ cell AlignDefault (RowSpan nrows) (ColSpan 1) content - -multicolumnCell :: PandocMonad m => LP m Cell -multicolumnCell = controlSeq "multicolumn" >> do - span' <- fmap (fromMaybe 1 . safeRead . untokenize) braced - alignment <- symbol '{' *> cellAlignment <* symbol '}' - - let singleCell = do - content <- plainify <$> blocks - return $ cell alignment (RowSpan 1) (ColSpan span') content - - -- Two possible contents: either a \multirow cell, or content. - -- E.g. \multicol{1}{c}{\multirow{2}{1em}{content}} - -- Note that a \multirow cell can be nested in a \multicolumn, - -- but not the other way around. See #6603 - let nestedCell = do - (Cell _ _ (RowSpan rs) _ bs) <- multirowCell - return $ cell - alignment - (RowSpan rs) - (ColSpan span') - (fromList bs) - - symbol '{' *> (nestedCell <|> singleCell) <* symbol '}' - --- Parse a simple cell, i.e. not multirow/multicol -parseSimpleCell :: PandocMonad m => LP m Cell -parseSimpleCell = simpleCell <$> (plainify <$> blocks) - --- LaTeX tables are stored with empty cells underneath multirow cells --- denoting the grid spaces taken up by them. More specifically, if a --- cell spans m rows, then it will overwrite all the cells in the --- columns it spans for (m-1) rows underneath it, requiring padding --- cells in these places. These padding cells need to be removed for --- proper table reading. See #6603. --- --- These fixTable functions do not otherwise fix up malformed --- input tables: that is left to the table builder. -fixTableHead :: TableHead -> TableHead -fixTableHead (TableHead attr rows) = TableHead attr rows' - where - rows' = fixTableRows rows - -fixTableBody :: TableBody -> TableBody -fixTableBody (TableBody attr rhc th tb) - = TableBody attr rhc th' tb' - where - th' = fixTableRows th - tb' = fixTableRows tb - -fixTableRows :: [Row] -> [Row] -fixTableRows = fixTableRows' $ repeat Nothing - where - fixTableRows' oldHang (Row attr cells : rs) - = let (newHang, cells') = fixTableRow oldHang cells - rs' = fixTableRows' newHang rs - in Row attr cells' : rs' - fixTableRows' _ [] = [] - --- The overhang is represented as Just (relative cell dimensions) or --- Nothing for an empty grid space. -fixTableRow :: [Maybe (ColSpan, RowSpan)] -> [Cell] -> ([Maybe (ColSpan, RowSpan)], [Cell]) -fixTableRow oldHang cells - -- If there's overhang, drop cells until their total width meets the - -- width of the occupied grid spaces (or we run out) - | (n, prefHang, restHang) <- splitHang oldHang - , n > 0 - = let cells' = dropToWidth getCellW n cells - (restHang', cells'') = fixTableRow restHang cells' - in (prefHang restHang', cells'') - -- Otherwise record the overhang of a pending cell and fix the rest - -- of the row - | c@(Cell _ _ h w _):cells' <- cells - = let h' = max 1 h - w' = max 1 w - oldHang' = dropToWidth getHangW w' oldHang - (newHang, cells'') = fixTableRow oldHang' cells' - in (toHang w' h' <> newHang, c : cells'') - | otherwise - = (oldHang, []) - where - getCellW (Cell _ _ _ w _) = w - getHangW = maybe 1 fst - getCS (ColSpan n) = n - - toHang c r - | r > 1 = [Just (c, r)] - | otherwise = replicate (getCS c) Nothing - - -- Take the prefix of the overhang list representing filled grid - -- spaces. Also return the remainder and the length of this prefix. - splitHang = splitHang' 0 id - - splitHang' !n l (Just (c, r):xs) - = splitHang' (n + c) (l . (toHang c (r-1) ++)) xs - splitHang' n l xs = (n, l, xs) - - -- Drop list items until the total width of the dropped items - -- exceeds the passed width. - dropToWidth _ n l | n < 1 = l - dropToWidth wproj n (c:cs) = dropToWidth wproj (n - wproj c) cs - dropToWidth _ _ [] = [] - -simpTable :: PandocMonad m => Text -> Bool -> LP m Blocks -simpTable envname hasWidthParameter = try $ do - when hasWidthParameter $ () <$ (spaces >> tok) - skipopts - colspecs <- parseAligns - let (aligns, widths, prefsufs) = unzip3 colspecs - optional $ controlSeq "caption" *> setCaption - spaces - optional label - spaces - optional lbreak - spaces - skipMany hline - spaces - header' <- option [] . try . fmap (:[]) $ - parseTableRow envname prefsufs <* lbreak <* many1 hline - spaces - rows <- sepEndBy (parseTableRow envname prefsufs) - (lbreak <* optional (skipMany hline)) - spaces - optional $ controlSeq "caption" *> setCaption - spaces - optional label - spaces - optional lbreak - spaces - lookAhead $ controlSeq "end" -- make sure we're at end - let th = fixTableHead $ TableHead nullAttr header' - let tbs = [fixTableBody $ TableBody nullAttr 0 [] rows] - let tf = TableFoot nullAttr [] - return $ table emptyCaption (zip aligns widths) th tbs tf - -addTableCaption :: PandocMonad m => Blocks -> LP m Blocks -addTableCaption = walkM go - where go (Table attr c spec th tb tf) = do - st <- getState - let mblabel = sLastLabel st - capt <- case (sCaption st, mblabel) of - (Just ils, Nothing) -> return $ caption Nothing (plain ils) - (Just ils, Just lab) -> do - num <- getNextNumber sLastTableNum - setState - st{ sLastTableNum = num - , sLabels = M.insert lab - [Str (renderDottedNum num)] - (sLabels st) } - return $ caption Nothing (plain ils) -- add number?? - (Nothing, _) -> return c - let attr' = case (attr, mblabel) of - ((_,classes,kvs), Just ident) -> - (ident,classes,kvs) - _ -> attr - return $ addAttrDiv attr' $ Table nullAttr capt spec th tb tf - go x = return x - --- TODO: For now we add a Div to contain table attributes, since --- most writers don't do anything yet with attributes on Table. --- This can be removed when that changes. -addAttrDiv :: Attr -> Block -> Block -addAttrDiv ("",[],[]) b = b -addAttrDiv attr b = Div attr [b] - block :: PandocMonad m => LP m Blocks block = do - res <- (mempty <$ spaces1) - <|> environment - <|> macroDef (rawBlock "latex") - <|> blockCommand - <|> paragraph - <|> grouped block + Tok _ toktype _ <- lookAhead anyTok + res <- (case toktype of + Newline -> mempty <$ spaces1 + Spaces -> mempty <$ spaces1 + Comment -> mempty <$ spaces1 + Word -> paragraph + CtrlSeq "begin" -> environment + CtrlSeq _ -> macroDef (rawBlock "latex") + <|> blockCommand + _ -> mzero) + <|> paragraph + <|> grouped block trace (T.take 60 $ tshow $ B.toList res) return res blocks :: PandocMonad m => LP m Blocks blocks = mconcat <$> many block -setDefaultLanguage :: PandocMonad m => LP m Blocks -setDefaultLanguage = do - o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') - <$> rawopt - polylang <- untokenize <$> braced - case M.lookup polylang polyglossiaLangToBCP47 of - Nothing -> return mempty -- TODO mzero? warning? - Just langFunc -> do - let l = langFunc o - setTranslations l - updateState $ setMeta "lang" $ str (renderLang l) - return mempty diff --git a/src/Text/Pandoc/Readers/LaTeX/Citation.hs b/src/Text/Pandoc/Readers/LaTeX/Citation.hs new file mode 100644 index 000000000..af97125c6 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Citation.hs @@ -0,0 +1,210 @@ +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Citation + ( citationCommands + , cites + ) +where + +import Text.Pandoc.Class +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Builder as B +import qualified Data.Map as M +import Data.Text (Text) +import Control.Applicative ((<|>), optional, many) +import Control.Monad (mzero) +import Control.Monad.Trans (lift) +import Control.Monad.Except (throwError) +import Text.Pandoc.Error (PandocError(PandocParsecError)) +import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) + +citationCommands :: PandocMonad m => LP m Inlines -> M.Map Text (LP m Inlines) +citationCommands inline = + let citation = citationWith inline + tok = spaces *> grouped inline + in M.fromList + [ ("cite", citation "cite" NormalCitation False) + , ("Cite", citation "Cite" NormalCitation False) + , ("citep", citation "citep" NormalCitation False) + , ("citep*", citation "citep*" NormalCitation False) + , ("citeal", citation "citeal" NormalCitation False) + , ("citealp", citation "citealp" NormalCitation False) + , ("citealp*", citation "citealp*" NormalCitation False) + , ("autocite", citation "autocite" NormalCitation False) + , ("smartcite", citation "smartcite" NormalCitation False) + , ("footcite", inNote <$> citation "footcite" NormalCitation False) + , ("parencite", citation "parencite" NormalCitation False) + , ("supercite", citation "supercite" NormalCitation False) + , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) + , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) + , ("citeyear", citation "citeyear" SuppressAuthor False) + , ("autocite*", citation "autocite*" SuppressAuthor False) + , ("cite*", citation "cite*" SuppressAuthor False) + , ("parencite*", citation "parencite*" SuppressAuthor False) + , ("textcite", citation "textcite" AuthorInText False) + , ("citet", citation "citet" AuthorInText False) + , ("citet*", citation "citet*" AuthorInText False) + , ("citealt", citation "citealt" AuthorInText False) + , ("citealt*", citation "citealt*" AuthorInText False) + , ("textcites", citation "textcites" AuthorInText True) + , ("cites", citation "cites" NormalCitation True) + , ("autocites", citation "autocites" NormalCitation True) + , ("footcites", inNote <$> citation "footcites" NormalCitation True) + , ("parencites", citation "parencites" NormalCitation True) + , ("supercites", citation "supercites" NormalCitation True) + , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) + , ("Autocite", citation "Autocite" NormalCitation False) + , ("Smartcite", citation "Smartcite" NormalCitation False) + , ("Footcite", inNote <$> citation "Footcite" NormalCitation False) + , ("Parencite", citation "Parencite" NormalCitation False) + , ("Supercite", citation "Supercite" NormalCitation False) + , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) + , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) + , ("Citeyear", citation "Citeyear" SuppressAuthor False) + , ("Autocite*", citation "Autocite*" SuppressAuthor False) + , ("Cite*", citation "Cite*" SuppressAuthor False) + , ("Parencite*", citation "Parencite*" SuppressAuthor False) + , ("Textcite", citation "Textcite" AuthorInText False) + , ("Textcites", citation "Textcites" AuthorInText True) + , ("Cites", citation "Cites" NormalCitation True) + , ("Autocites", citation "Autocites" NormalCitation True) + , ("Footcites", inNote <$> citation "Footcites" NormalCitation True) + , ("Parencites", citation "Parencites" NormalCitation True) + , ("Supercites", citation "Supercites" NormalCitation True) + , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) + , ("citetext", complexNatbibCitation inline NormalCitation) + , ("citeauthor", (try (tok *> sp *> controlSeq "citetext") *> + complexNatbibCitation inline AuthorInText) + <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) + ] + +-- citations + +addPrefix :: [Inline] -> [Citation] -> [Citation] +addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks +addPrefix _ _ = [] + +addSuffix :: [Inline] -> [Citation] -> [Citation] +addSuffix s ks@(_:_) = + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] +addSuffix _ _ = [] + +simpleCiteArgs :: forall m . PandocMonad m => LP m Inlines -> LP m [Citation] +simpleCiteArgs inline = try $ do + first <- optionMaybe $ toList <$> opt + second <- optionMaybe $ toList <$> opt + keys <- try $ bgroup *> manyTill citationLabel egroup + let (pre, suf) = case (first , second ) of + (Just s , Nothing) -> (mempty, s ) + (Just s , Just t ) -> (s , t ) + _ -> (mempty, mempty) + conv k = Citation { citationId = k + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationHash = 0 + , citationNoteNum = 0 + } + return $ addPrefix pre $ addSuffix suf $ map conv keys + where + opt :: PandocMonad m => LP m Inlines + opt = do + toks <- try (sp *> bracketedToks <* sp) + -- now parse the toks as inlines + st <- getState + parsed <- lift $ + runParserT (mconcat <$> many inline) st "bracketed option" toks + case parsed of + Right result -> return result + Left e -> throwError $ PandocParsecError (toSources toks) e + + + +citationLabel :: PandocMonad m => LP m Text +citationLabel = do + sp + untokenize <$> + (many1 (satisfyTok isWordTok <|> symbolIn bibtexKeyChar) + <* sp + <* optional (symbol ',') + <* sp) + where bibtexKeyChar = ".:;?!`'()/*@_+=-&[]" :: [Char] + +cites :: PandocMonad m + => LP m Inlines -> CitationMode -> Bool -> LP m [Citation] +cites inline mode multi = try $ do + let paropt = parenWrapped inline + cits <- if multi + then do + multiprenote <- optionMaybe $ toList <$> paropt + multipostnote <- optionMaybe $ toList <$> paropt + let (pre, suf) = case (multiprenote, multipostnote) of + (Just s , Nothing) -> (mempty, s) + (Nothing , Just t) -> (mempty, t) + (Just s , Just t ) -> (s, t) + _ -> (mempty, mempty) + tempCits <- many1 $ simpleCiteArgs inline + case tempCits of + (k:ks) -> case ks of + (_:_) -> return $ (addMprenote pre k : init ks) ++ + [addMpostnote suf (last ks)] + _ -> return [addMprenote pre (addMpostnote suf k)] + _ -> return [[]] + else count 1 $ simpleCiteArgs inline + let cs = concat cits + return $ case mode of + AuthorInText -> case cs of + (c:rest) -> c {citationMode = mode} : rest + [] -> [] + _ -> map (\a -> a {citationMode = mode}) cs + where mprenote (k:ks) = (k:ks) ++ [Space] + mprenote _ = mempty + mpostnote (k:ks) = [Str ",", Space] ++ (k:ks) + mpostnote _ = mempty + addMprenote mpn (k:ks) = + let mpnfinal = case citationPrefix k of + (_:_) -> mprenote mpn + _ -> mpn + in addPrefix mpnfinal (k:ks) + addMprenote _ _ = [] + addMpostnote = addSuffix . mpostnote + +citationWith :: PandocMonad m + => LP m Inlines -> Text -> CitationMode -> Bool -> LP m Inlines +citationWith inline name mode multi = do + (c,raw) <- withRaw $ cites inline mode multi + return $ cite c (rawInline "latex" $ "\\" <> name <> untokenize raw) + +handleCitationPart :: Inlines -> [Citation] +handleCitationPart ils = + let isCite Cite{} = True + isCite _ = False + (pref, rest) = break isCite (toList ils) + in case rest of + (Cite cs _:suff) -> addPrefix pref $ addSuffix suff cs + _ -> [] + +complexNatbibCitation :: PandocMonad m + => LP m Inlines -> CitationMode -> LP m Inlines +complexNatbibCitation inline mode = try $ do + (cs, raw) <- + withRaw $ concat <$> do + bgroup + items <- mconcat <$> + many1 (notFollowedBy (symbol ';') >> inline) + `sepBy1` symbol ';' + egroup + return $ map handleCitationPart items + case cs of + [] -> mzero + (c:cits) -> return $ cite (c{ citationMode = mode }:cits) + (rawInline "latex" $ "\\citetext" <> untokenize raw) + +inNote :: Inlines -> Inlines +inNote ils = + note $ para $ ils <> str "." + diff --git a/src/Text/Pandoc/Readers/LaTeX/Inline.hs b/src/Text/Pandoc/Readers/LaTeX/Inline.hs new file mode 100644 index 000000000..7b8bca4af --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Inline.hs @@ -0,0 +1,397 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.LaTeX.Inline + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.Readers.LaTeX.Inline + ( acronymCommands + , verbCommands + , charCommands + , accentCommands + , nameCommands + , biblatexInlineCommands + , refCommands + , rawInlineOr + , listingsLanguage + ) +where + +import qualified Data.Map as M +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Builder +import Text.Pandoc.Shared (toRomanNumeral, safeRead) +import Text.Pandoc.Readers.LaTeX.Types (Tok (..), TokType (..)) +import Control.Applicative (optional, (<|>)) +import Control.Monad (guard, mzero, mplus, unless) +import Text.Pandoc.Class.PandocMonad (PandocMonad (..), translateTerm) +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Extensions (extensionEnabled, Extension(..)) +import Text.Pandoc.Parsing (getOption, updateState, getState, notFollowedBy, + manyTill, getInput, setInput, incSourceColumn, + option, many1, try) +import Data.Char (isDigit) +import Text.Pandoc.Highlighting (fromListingsLanguage,) +import Data.Maybe (maybeToList, fromMaybe) +import Text.Pandoc.Options (ReaderOptions(..)) +import qualified Data.Text.Normalize as Normalize +import qualified Text.Pandoc.Translations as Translations + +rawInlineOr :: PandocMonad m => Text -> LP m Inlines -> LP m Inlines +rawInlineOr name' fallback = do + parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions + if parseRaw + then rawInline "latex" <$> getRawCommand name' ("\\" <> name') + else fallback + +dolabel :: PandocMonad m => LP m Inlines +dolabel = do + v <- braced + let refstr = untokenize v + updateState $ \st -> + st{ sLastLabel = Just refstr } + return $ spanWith (refstr,[],[("label", refstr)]) + $ inBrackets $ str $ untokenize v + +doref :: PandocMonad m => Text -> LP m Inlines +doref cls = do + v <- braced + let refstr = untokenize v + return $ linkWith ("",[],[ ("reference-type", cls) + , ("reference", refstr)]) + ("#" <> refstr) + "" + (inBrackets $ str refstr) + +inBrackets :: Inlines -> Inlines +inBrackets x = str "[" <> x <> str "]" + +doTerm :: PandocMonad m => Translations.Term -> LP m Inlines +doTerm term = str <$> translateTerm term + +lit :: Text -> LP m Inlines +lit = pure . str + +doverb :: PandocMonad m => LP m Inlines +doverb = do + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + withVerbatimMode $ + code . untokenize <$> + manyTill (notFollowedBy newlineTok >> verbTok marker) (symbol marker) + +verbTok :: PandocMonad m => Char -> LP m Tok +verbTok stopchar = do + t@(Tok pos toktype txt) <- anyTok + case T.findIndex (== stopchar) txt of + Nothing -> return t + Just i -> do + let (t1, t2) = T.splitAt i txt + inp <- getInput + setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar) + : totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2) ++ inp + return $ Tok pos toktype t1 + +listingsLanguage :: [(Text, Text)] -> Maybe Text +listingsLanguage opts = + case lookup "language" opts of + Nothing -> Nothing + Just l -> fromListingsLanguage l `mplus` Just l + +dolstinline :: PandocMonad m => LP m Inlines +dolstinline = do + options <- option [] keyvals + let classes = maybeToList $ listingsLanguage options + doinlinecode classes + +domintinline :: PandocMonad m => LP m Inlines +domintinline = do + skipopts + cls <- untokenize <$> braced + doinlinecode [cls] + +doinlinecode :: PandocMonad m => [Text] -> LP m Inlines +doinlinecode classes = do + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + let stopchar = if marker == '{' then '}' else marker + withVerbatimMode $ + codeWith ("",classes,[]) . T.map nlToSpace . untokenize <$> + manyTill (verbTok stopchar) (symbol stopchar) + +nlToSpace :: Char -> Char +nlToSpace '\n' = ' ' +nlToSpace x = x + +romanNumeralUpper :: (PandocMonad m) => LP m Inlines +romanNumeralUpper = + str . toRomanNumeral <$> romanNumeralArg + +romanNumeralLower :: (PandocMonad m) => LP m Inlines +romanNumeralLower = + str . T.toLower . toRomanNumeral <$> romanNumeralArg + +romanNumeralArg :: (PandocMonad m) => LP m Int +romanNumeralArg = spaces *> (parser <|> inBraces) + where + inBraces = do + symbol '{' + spaces + res <- parser + spaces + symbol '}' + return res + parser = do + s <- untokenize <$> many1 (satisfyTok isWordTok) + let (digits, rest) = T.span isDigit s + unless (T.null rest) $ + Prelude.fail "Non-digits in argument to \\Rn or \\RN" + safeRead digits + +accentWith :: PandocMonad m + => LP m Inlines -> Char -> Maybe Char -> LP m Inlines +accentWith tok combiningAccent fallBack = try $ do + ils <- tok + case toList ils of + (Str (T.uncons -> Just (x, xs)) : ys) -> return $ fromList $ + -- try to normalize to the combined character: + Str (Normalize.normalize Normalize.NFC + (T.pack [x, combiningAccent]) <> xs) : ys + [Space] -> return $ str $ T.singleton + $ fromMaybe combiningAccent fallBack + [] -> return $ str $ T.singleton + $ fromMaybe combiningAccent fallBack + _ -> return ils + + +verbCommands :: PandocMonad m => M.Map Text (LP m Inlines) +verbCommands = M.fromList + [ ("verb", doverb) + , ("lstinline", dolstinline) + , ("mintinline", domintinline) + , ("Verb", doverb) + ] + +accentCommands :: PandocMonad m => LP m Inlines -> M.Map Text (LP m Inlines) +accentCommands tok = + let accent = accentWith tok + in M.fromList + [ ("aa", lit "å") + , ("AA", lit "Å") + , ("ss", lit "ß") + , ("o", lit "ø") + , ("O", lit "Ø") + , ("L", lit "Ł") + , ("l", lit "ł") + , ("ae", lit "æ") + , ("AE", lit "Æ") + , ("oe", lit "œ") + , ("OE", lit "Œ") + , ("pounds", lit "£") + , ("euro", lit "€") + , ("copyright", lit "©") + , ("textasciicircum", lit "^") + , ("textasciitilde", lit "~") + , ("H", accent '\779' Nothing) -- hungarumlaut + , ("`", accent '\768' (Just '`')) -- grave + , ("'", accent '\769' (Just '\'')) -- acute + , ("^", accent '\770' (Just '^')) -- circ + , ("~", accent '\771' (Just '~')) -- tilde + , ("\"", accent '\776' Nothing) -- umlaut + , (".", accent '\775' Nothing) -- dot + , ("=", accent '\772' Nothing) -- macron + , ("|", accent '\781' Nothing) -- vertical line above + , ("b", accent '\817' Nothing) -- macron below + , ("c", accent '\807' Nothing) -- cedilla + , ("G", accent '\783' Nothing) -- doublegrave + , ("h", accent '\777' Nothing) -- hookabove + , ("d", accent '\803' Nothing) -- dotbelow + , ("f", accent '\785' Nothing) -- inverted breve + , ("r", accent '\778' Nothing) -- ringabove + , ("t", accent '\865' Nothing) -- double inverted breve + , ("U", accent '\782' Nothing) -- double vertical line above + , ("v", accent '\780' Nothing) -- hacek + , ("u", accent '\774' Nothing) -- breve + , ("k", accent '\808' Nothing) -- ogonek + , ("textogonekcentered", accent '\808' Nothing) -- ogonek + , ("i", lit "ı") -- dotless i + , ("j", lit "ȷ") -- dotless j + , ("newtie", accent '\785' Nothing) -- inverted breve + , ("textcircled", accent '\8413' Nothing) -- combining circle + ] + +charCommands :: PandocMonad m => M.Map Text (LP m Inlines) +charCommands = M.fromList + [ ("ldots", lit "…") + , ("vdots", lit "\8942") + , ("dots", lit "…") + , ("mdots", lit "…") + , ("sim", lit "~") + , ("sep", lit ",") + , ("P", lit "¶") + , ("S", lit "§") + , ("$", lit "$") + , ("%", lit "%") + , ("&", lit "&") + , ("#", lit "#") + , ("_", lit "_") + , ("{", lit "{") + , ("}", lit "}") + , ("qed", lit "\a0\x25FB") + , ("lq", return (str "‘")) + , ("rq", return (str "’")) + , ("textquoteleft", return (str "‘")) + , ("textquoteright", return (str "’")) + , ("textquotedblleft", return (str "“")) + , ("textquotedblright", return (str "”")) + , ("/", pure mempty) -- italic correction + , ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState + guard $ not inTableCell + optional rawopt + spaces)) + , (",", lit "\8198") + , ("@", pure mempty) + , (" ", lit "\160") + , ("ps", pure $ str "PS." <> space) + , ("TeX", lit "TeX") + , ("LaTeX", lit "LaTeX") + , ("bar", lit "|") + , ("textless", lit "<") + , ("textgreater", lit ">") + , ("textbackslash", lit "\\") + , ("backslash", lit "\\") + , ("slash", lit "/") + -- fontawesome + , ("faCheck", lit "\10003") + , ("faClose", lit "\10007") + -- hyphenat + , ("bshyp", lit "\\\173") + , ("fshyp", lit "/\173") + , ("dothyp", lit ".\173") + , ("colonhyp", lit ":\173") + , ("hyp", lit "-") + ] + +biblatexInlineCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +biblatexInlineCommands tok = M.fromList + -- biblatex misc + [ ("RN", romanNumeralUpper) + , ("Rn", romanNumeralLower) + , ("mkbibquote", spanWith nullAttr . doubleQuoted <$> tok) + , ("mkbibemph", spanWith nullAttr . emph <$> tok) + , ("mkbibitalic", spanWith nullAttr . emph <$> tok) + , ("mkbibbold", spanWith nullAttr . strong <$> tok) + , ("mkbibparens", + spanWith nullAttr . (\x -> str "(" <> x <> str ")") <$> tok) + , ("mkbibbrackets", + spanWith nullAttr . (\x -> str "[" <> x <> str "]") <$> tok) + , ("autocap", spanWith nullAttr <$> tok) + , ("textnormal", spanWith ("",["nodecor"],[]) <$> tok) + , ("bibstring", + (\x -> spanWith ("",[],[("bibstring",x)]) (str x)) . untokenize + <$> braced) + , ("adddot", pure (str ".")) + , ("adddotspace", pure (spanWith nullAttr (str "." <> space))) + , ("addabbrvspace", pure space) + , ("hyphen", pure (str "-")) + ] + +nameCommands :: PandocMonad m => M.Map Text (LP m Inlines) +nameCommands = M.fromList + [ ("figurename", doTerm Translations.Figure) + , ("prefacename", doTerm Translations.Preface) + , ("refname", doTerm Translations.References) + , ("bibname", doTerm Translations.Bibliography) + , ("chaptername", doTerm Translations.Chapter) + , ("partname", doTerm Translations.Part) + , ("contentsname", doTerm Translations.Contents) + , ("listfigurename", doTerm Translations.ListOfFigures) + , ("listtablename", doTerm Translations.ListOfTables) + , ("indexname", doTerm Translations.Index) + , ("abstractname", doTerm Translations.Abstract) + , ("tablename", doTerm Translations.Table) + , ("enclname", doTerm Translations.Encl) + , ("ccname", doTerm Translations.Cc) + , ("headtoname", doTerm Translations.To) + , ("pagename", doTerm Translations.Page) + , ("seename", doTerm Translations.See) + , ("seealsoname", doTerm Translations.SeeAlso) + , ("proofname", doTerm Translations.Proof) + , ("glossaryname", doTerm Translations.Glossary) + , ("lstlistingname", doTerm Translations.Listing) + ] + +refCommands :: PandocMonad m => M.Map Text (LP m Inlines) +refCommands = M.fromList + [ ("label", rawInlineOr "label" dolabel) + , ("ref", rawInlineOr "ref" $ doref "ref") + , ("cref", rawInlineOr "cref" $ doref "ref") -- from cleveref.sty + , ("vref", rawInlineOr "vref" $ doref "ref+page") -- from varioref.sty + , ("eqref", rawInlineOr "eqref" $ doref "eqref") -- from amsmath.sty + ] + +acronymCommands :: PandocMonad m => M.Map Text (LP m Inlines) +acronymCommands = M.fromList + -- glossaries package + [ ("gls", doAcronym "short") + , ("Gls", doAcronym "short") + , ("glsdesc", doAcronym "long") + , ("Glsdesc", doAcronym "long") + , ("GLSdesc", doAcronym "long") + , ("acrlong", doAcronym "long") + , ("Acrlong", doAcronym "long") + , ("acrfull", doAcronym "full") + , ("Acrfull", doAcronym "full") + , ("acrshort", doAcronym "abbrv") + , ("Acrshort", doAcronym "abbrv") + , ("glspl", doAcronymPlural "short") + , ("Glspl", doAcronymPlural "short") + , ("glsdescplural", doAcronymPlural "long") + , ("Glsdescplural", doAcronymPlural "long") + , ("GLSdescplural", doAcronymPlural "long") + -- acronyms package + , ("ac", doAcronym "short") + , ("acf", doAcronym "full") + , ("acs", doAcronym "abbrv") + , ("acl", doAcronym "long") + , ("acp", doAcronymPlural "short") + , ("acfp", doAcronymPlural "full") + , ("acsp", doAcronymPlural "abbrv") + , ("aclp", doAcronymPlural "long") + , ("Ac", doAcronym "short") + , ("Acf", doAcronym "full") + , ("Acs", doAcronym "abbrv") + , ("Acl", doAcronym "long") + , ("Acp", doAcronymPlural "short") + , ("Acfp", doAcronymPlural "full") + , ("Acsp", doAcronymPlural "abbrv") + , ("Aclp", doAcronymPlural "long") + ] + +doAcronym :: PandocMonad m => Text -> LP m Inlines +doAcronym form = do + acro <- braced + return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), + ("acronym-form", "singular+" <> form)]) + $ str $ untokenize acro] + +doAcronymPlural :: PandocMonad m => Text -> LP m Inlines +doAcronymPlural form = do + acro <- braced + let plural = str "s" + return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), + ("acronym-form", "plural+" <> form)]) $ + mconcat [str $ untokenize acro, plural]] + + diff --git a/src/Text/Pandoc/Readers/LaTeX/Lang.hs b/src/Text/Pandoc/Readers/LaTeX/Lang.hs index 814b2fe79..6a8327904 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Lang.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Lang.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.LaTeX.Lang - Copyright : Copyright (C) 2018-2020 John MacFarlane + Copyright : Copyright (C) 2018-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -12,144 +12,223 @@ Functions for parsing polyglossia and babel language specifiers to BCP47 'Lang'. -} module Text.Pandoc.Readers.LaTeX.Lang - ( polyglossiaLangToBCP47 + ( setDefaultLanguage + , polyglossiaLangToBCP47 , babelLangToBCP47 + , enquoteCommands + , inlineLanguageCommands ) where import qualified Data.Map as M +import Data.Text (Text) import qualified Data.Text as T -import Text.Pandoc.BCP47 (Lang(..)) +import Text.Pandoc.Shared (extractSpaces) +import Text.Collate.Lang (Lang(..), renderLang) +import Text.Pandoc.Class (PandocMonad(..), setTranslations) +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Parsing (updateState, option, getState, QuoteContext(..), + withQuoteContext) +import Text.Pandoc.Builder (Blocks, Inlines, setMeta, str, spanWith, + singleQuoted, doubleQuoted) + +enquote :: PandocMonad m + => LP m Inlines + -> Bool -> Maybe Text -> LP m Inlines +enquote tok starred mblang = do + skipopts + let lang = mblang >>= babelLangToBCP47 + let langspan = case lang of + Nothing -> id + Just l -> spanWith ("",[],[("lang", renderLang l)]) + quoteContext <- sQuoteContext <$> getState + if starred || quoteContext == InDoubleQuote + then singleQuoted . langspan <$> withQuoteContext InSingleQuote tok + else doubleQuoted . langspan <$> withQuoteContext InDoubleQuote tok + +enquoteCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +enquoteCommands tok = M.fromList + [ ("enquote*", enquote tok True Nothing) + , ("enquote", enquote tok False Nothing) + -- foreignquote is supposed to use native quote marks + , ("foreignquote*", braced >>= enquote tok True . Just . untokenize) + , ("foreignquote", braced >>= enquote tok False . Just . untokenize) + -- hypehnquote uses regular quotes + , ("hyphenquote*", braced >>= enquote tok True . Just . untokenize) + , ("hyphenquote", braced >>= enquote tok False . Just . untokenize) + ] + +foreignlanguage :: PandocMonad m => LP m Inlines -> LP m Inlines +foreignlanguage tok = do + babelLang <- untokenize <$> braced + case babelLangToBCP47 babelLang of + Just lang -> spanWith ("", [], [("lang", renderLang lang)]) <$> tok + _ -> tok + +inlineLanguageCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +inlineLanguageCommands tok = + M.fromList $ + ("foreignlanguage", foreignlanguage tok) : + (mk <$> M.toList polyglossiaLangToBCP47) + where + mk (polyglossia, bcp47Func) = + ("text" <> polyglossia, inlineLanguage tok bcp47Func) + +inlineLanguage :: PandocMonad m + => LP m Inlines -> (Text -> Lang) -> LP m Inlines +inlineLanguage tok bcp47Func = do + o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') + <$> rawopt + let lang = renderLang $ bcp47Func o + extractSpaces (spanWith ("", [], [("lang", lang)])) <$> tok + +setDefaultLanguage :: PandocMonad m => LP m Blocks +setDefaultLanguage = do + o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') + <$> rawopt + polylang <- untokenize <$> braced + case M.lookup polylang polyglossiaLangToBCP47 of + Nothing -> return mempty -- TODO mzero? warning? + Just langFunc -> do + let l = langFunc o + setTranslations l + updateState $ setMeta "lang" $ str (renderLang l) + return mempty polyglossiaLangToBCP47 :: M.Map T.Text (T.Text -> Lang) polyglossiaLangToBCP47 = M.fromList [ ("arabic", \o -> case T.filter (/=' ') o of - "locale=algeria" -> Lang "ar" "" "DZ" [] - "locale=mashriq" -> Lang "ar" "" "SY" [] - "locale=libya" -> Lang "ar" "" "LY" [] - "locale=morocco" -> Lang "ar" "" "MA" [] - "locale=mauritania" -> Lang "ar" "" "MR" [] - "locale=tunisia" -> Lang "ar" "" "TN" [] - _ -> Lang "ar" "" "" []) + "locale=algeria" -> Lang "ar" Nothing (Just "DZ") [] [] [] + "locale=mashriq" -> Lang "ar" Nothing (Just "SY") [] [] [] + "locale=libya" -> Lang "ar" Nothing (Just "LY") [] [] [] + "locale=morocco" -> Lang "ar" Nothing (Just "MA") [] [] [] + "locale=mauritania" -> Lang "ar" Nothing (Just "MR") [] [] [] + "locale=tunisia" -> Lang "ar" Nothing (Just "TN") [] [] [] + _ -> Lang "ar" Nothing (Just "") [] [] []) , ("german", \o -> case T.filter (/=' ') o of - "spelling=old" -> Lang "de" "" "DE" ["1901"] + "spelling=old" -> Lang "de" Nothing (Just "DE") ["1901"] [] [] "variant=austrian,spelling=old" - -> Lang "de" "" "AT" ["1901"] - "variant=austrian" -> Lang "de" "" "AT" [] + -> Lang "de" Nothing (Just "AT") ["1901"] [] [] + "variant=austrian" -> Lang "de" Nothing (Just "AT") [] [] [] "variant=swiss,spelling=old" - -> Lang "de" "" "CH" ["1901"] - "variant=swiss" -> Lang "de" "" "CH" [] - _ -> Lang "de" "" "" []) - , ("lsorbian", \_ -> Lang "dsb" "" "" []) + -> Lang "de" Nothing (Just "CH") ["1901"] [] [] + "variant=swiss" -> Lang "de" Nothing (Just "CH") [] [] [] + _ -> Lang "de" Nothing Nothing [] [] []) + , ("lsorbian", \_ -> Lang "dsb" Nothing Nothing [] [] []) , ("greek", \o -> case T.filter (/=' ') o of - "variant=poly" -> Lang "el" "" "polyton" [] - "variant=ancient" -> Lang "grc" "" "" [] - _ -> Lang "el" "" "" []) + "variant=poly" -> Lang "el" Nothing (Just "polyton") [] [] [] + "variant=ancient" -> Lang "grc" Nothing Nothing [] [] [] + _ -> Lang "el" Nothing Nothing [] [] []) , ("english", \o -> case T.filter (/=' ') o of - "variant=australian" -> Lang "en" "" "AU" [] - "variant=canadian" -> Lang "en" "" "CA" [] - "variant=british" -> Lang "en" "" "GB" [] - "variant=newzealand" -> Lang "en" "" "NZ" [] - "variant=american" -> Lang "en" "" "US" [] - _ -> Lang "en" "" "" []) - , ("usorbian", \_ -> Lang "hsb" "" "" []) + "variant=australian" -> Lang "en" Nothing (Just "AU") [] [] [] + "variant=canadian" -> Lang "en" Nothing (Just "CA") [] [] [] + "variant=british" -> Lang "en" Nothing (Just "GB") [] [] [] + "variant=newzealand" -> Lang "en" Nothing (Just "NZ") [] [] [] + "variant=american" -> Lang "en" Nothing (Just "US") [] [] [] + _ -> Lang "en" Nothing (Just "") [] [] []) + , ("usorbian", \_ -> Lang "hsb" Nothing Nothing [] [] []) , ("latin", \o -> case T.filter (/=' ') o of - "variant=classic" -> Lang "la" "" "" ["x-classic"] - _ -> Lang "la" "" "" []) - , ("slovenian", \_ -> Lang "sl" "" "" []) - , ("serbianc", \_ -> Lang "sr" "cyrl" "" []) - , ("pinyin", \_ -> Lang "zh" "Latn" "" ["pinyin"]) - , ("afrikaans", \_ -> Lang "af" "" "" []) - , ("amharic", \_ -> Lang "am" "" "" []) - , ("assamese", \_ -> Lang "as" "" "" []) - , ("asturian", \_ -> Lang "ast" "" "" []) - , ("bulgarian", \_ -> Lang "bg" "" "" []) - , ("bengali", \_ -> Lang "bn" "" "" []) - , ("tibetan", \_ -> Lang "bo" "" "" []) - , ("breton", \_ -> Lang "br" "" "" []) - , ("catalan", \_ -> Lang "ca" "" "" []) - , ("welsh", \_ -> Lang "cy" "" "" []) - , ("czech", \_ -> Lang "cs" "" "" []) - , ("coptic", \_ -> Lang "cop" "" "" []) - , ("danish", \_ -> Lang "da" "" "" []) - , ("divehi", \_ -> Lang "dv" "" "" []) - , ("esperanto", \_ -> Lang "eo" "" "" []) - , ("spanish", \_ -> Lang "es" "" "" []) - , ("estonian", \_ -> Lang "et" "" "" []) - , ("basque", \_ -> Lang "eu" "" "" []) - , ("farsi", \_ -> Lang "fa" "" "" []) - , ("finnish", \_ -> Lang "fi" "" "" []) - , ("french", \_ -> Lang "fr" "" "" []) - , ("friulan", \_ -> Lang "fur" "" "" []) - , ("irish", \_ -> Lang "ga" "" "" []) - , ("scottish", \_ -> Lang "gd" "" "" []) - , ("ethiopic", \_ -> Lang "gez" "" "" []) - , ("galician", \_ -> Lang "gl" "" "" []) - , ("hebrew", \_ -> Lang "he" "" "" []) - , ("hindi", \_ -> Lang "hi" "" "" []) - , ("croatian", \_ -> Lang "hr" "" "" []) - , ("magyar", \_ -> Lang "hu" "" "" []) - , ("armenian", \_ -> Lang "hy" "" "" []) - , ("interlingua", \_ -> Lang "ia" "" "" []) - , ("indonesian", \_ -> Lang "id" "" "" []) - , ("icelandic", \_ -> Lang "is" "" "" []) - , ("italian", \_ -> Lang "it" "" "" []) - , ("japanese", \_ -> Lang "jp" "" "" []) - , ("khmer", \_ -> Lang "km" "" "" []) - , ("kurmanji", \_ -> Lang "kmr" "" "" []) - , ("kannada", \_ -> Lang "kn" "" "" []) - , ("korean", \_ -> Lang "ko" "" "" []) - , ("lao", \_ -> Lang "lo" "" "" []) - , ("lithuanian", \_ -> Lang "lt" "" "" []) - , ("latvian", \_ -> Lang "lv" "" "" []) - , ("malayalam", \_ -> Lang "ml" "" "" []) - , ("mongolian", \_ -> Lang "mn" "" "" []) - , ("marathi", \_ -> Lang "mr" "" "" []) - , ("dutch", \_ -> Lang "nl" "" "" []) - , ("nynorsk", \_ -> Lang "nn" "" "" []) - , ("norsk", \_ -> Lang "no" "" "" []) - , ("nko", \_ -> Lang "nqo" "" "" []) - , ("occitan", \_ -> Lang "oc" "" "" []) - , ("panjabi", \_ -> Lang "pa" "" "" []) - , ("polish", \_ -> Lang "pl" "" "" []) - , ("piedmontese", \_ -> Lang "pms" "" "" []) - , ("portuguese", \_ -> Lang "pt" "" "" []) - , ("romansh", \_ -> Lang "rm" "" "" []) - , ("romanian", \_ -> Lang "ro" "" "" []) - , ("russian", \_ -> Lang "ru" "" "" []) - , ("sanskrit", \_ -> Lang "sa" "" "" []) - , ("samin", \_ -> Lang "se" "" "" []) - , ("slovak", \_ -> Lang "sk" "" "" []) - , ("albanian", \_ -> Lang "sq" "" "" []) - , ("serbian", \_ -> Lang "sr" "" "" []) - , ("swedish", \_ -> Lang "sv" "" "" []) - , ("syriac", \_ -> Lang "syr" "" "" []) - , ("tamil", \_ -> Lang "ta" "" "" []) - , ("telugu", \_ -> Lang "te" "" "" []) - , ("thai", \_ -> Lang "th" "" "" []) - , ("turkmen", \_ -> Lang "tk" "" "" []) - , ("turkish", \_ -> Lang "tr" "" "" []) - , ("ukrainian", \_ -> Lang "uk" "" "" []) - , ("urdu", \_ -> Lang "ur" "" "" []) - , ("vietnamese", \_ -> Lang "vi" "" "" []) + "variant=classic" -> Lang "la" Nothing Nothing ["x-classic"] [] [] + _ -> Lang "la" Nothing Nothing [] [] []) + , ("slovenian", \_ -> Lang "sl" Nothing Nothing [] [] []) + , ("serbianc", \_ -> Lang "sr" (Just "Cyrl") Nothing [] [] []) + , ("pinyin", \_ -> Lang "zh" (Just "Latn") Nothing ["pinyin"] [] []) + , ("afrikaans", \_ -> simpleLang "af") + , ("amharic", \_ -> simpleLang "am") + , ("assamese", \_ -> simpleLang "as") + , ("asturian", \_ -> simpleLang "ast") + , ("bulgarian", \_ -> simpleLang "bg") + , ("bengali", \_ -> simpleLang "bn") + , ("tibetan", \_ -> simpleLang "bo") + , ("breton", \_ -> simpleLang "br") + , ("catalan", \_ -> simpleLang "ca") + , ("welsh", \_ -> simpleLang "cy") + , ("czech", \_ -> simpleLang "cs") + , ("coptic", \_ -> simpleLang "cop") + , ("danish", \_ -> simpleLang "da") + , ("divehi", \_ -> simpleLang "dv") + , ("esperanto", \_ -> simpleLang "eo") + , ("spanish", \_ -> simpleLang "es") + , ("estonian", \_ -> simpleLang "et") + , ("basque", \_ -> simpleLang "eu") + , ("farsi", \_ -> simpleLang "fa") + , ("finnish", \_ -> simpleLang "fi") + , ("french", \_ -> simpleLang "fr") + , ("friulan", \_ -> simpleLang "fur") + , ("irish", \_ -> simpleLang "ga") + , ("scottish", \_ -> simpleLang "gd") + , ("ethiopic", \_ -> simpleLang "gez") + , ("galician", \_ -> simpleLang "gl") + , ("hebrew", \_ -> simpleLang "he") + , ("hindi", \_ -> simpleLang "hi") + , ("croatian", \_ -> simpleLang "hr") + , ("magyar", \_ -> simpleLang "hu") + , ("armenian", \_ -> simpleLang "hy") + , ("interlingua", \_ -> simpleLang "ia") + , ("indonesian", \_ -> simpleLang "id") + , ("icelandic", \_ -> simpleLang "is") + , ("italian", \_ -> simpleLang "it") + , ("japanese", \_ -> simpleLang "jp") + , ("khmer", \_ -> simpleLang "km") + , ("kurmanji", \_ -> simpleLang "kmr") + , ("kannada", \_ -> simpleLang "kn") + , ("korean", \_ -> simpleLang "ko") + , ("lao", \_ -> simpleLang "lo") + , ("lithuanian", \_ -> simpleLang "lt") + , ("latvian", \_ -> simpleLang "lv") + , ("malayalam", \_ -> simpleLang "ml") + , ("mongolian", \_ -> simpleLang "mn") + , ("marathi", \_ -> simpleLang "mr") + , ("dutch", \_ -> simpleLang "nl") + , ("nynorsk", \_ -> simpleLang "nn") + , ("norsk", \_ -> simpleLang "no") + , ("nko", \_ -> simpleLang "nqo") + , ("occitan", \_ -> simpleLang "oc") + , ("panjabi", \_ -> simpleLang "pa") + , ("polish", \_ -> simpleLang "pl") + , ("piedmontese", \_ -> simpleLang "pms") + , ("portuguese", \_ -> simpleLang "pt") + , ("romansh", \_ -> simpleLang "rm") + , ("romanian", \_ -> simpleLang "ro") + , ("russian", \_ -> simpleLang "ru") + , ("sanskrit", \_ -> simpleLang "sa") + , ("samin", \_ -> simpleLang "se") + , ("slovak", \_ -> simpleLang "sk") + , ("albanian", \_ -> simpleLang "sq") + , ("serbian", \_ -> simpleLang "sr") + , ("swedish", \_ -> simpleLang "sv") + , ("syriac", \_ -> simpleLang "syr") + , ("tamil", \_ -> simpleLang "ta") + , ("telugu", \_ -> simpleLang "te") + , ("thai", \_ -> simpleLang "th") + , ("turkmen", \_ -> simpleLang "tk") + , ("turkish", \_ -> simpleLang "tr") + , ("ukrainian", \_ -> simpleLang "uk") + , ("urdu", \_ -> simpleLang "ur") + , ("vietnamese", \_ -> simpleLang "vi") ] +simpleLang :: Text -> Lang +simpleLang l = Lang l Nothing Nothing [] [] [] + babelLangToBCP47 :: T.Text -> Maybe Lang babelLangToBCP47 s = case s of - "austrian" -> Just $ Lang "de" "" "AT" ["1901"] - "naustrian" -> Just $ Lang "de" "" "AT" [] - "swissgerman" -> Just $ Lang "de" "" "CH" ["1901"] - "nswissgerman" -> Just $ Lang "de" "" "CH" [] - "german" -> Just $ Lang "de" "" "DE" ["1901"] - "ngerman" -> Just $ Lang "de" "" "DE" [] - "lowersorbian" -> Just $ Lang "dsb" "" "" [] - "uppersorbian" -> Just $ Lang "hsb" "" "" [] - "polutonikogreek" -> Just $ Lang "el" "" "" ["polyton"] - "slovene" -> Just $ Lang "sl" "" "" [] - "australian" -> Just $ Lang "en" "" "AU" [] - "canadian" -> Just $ Lang "en" "" "CA" [] - "british" -> Just $ Lang "en" "" "GB" [] - "newzealand" -> Just $ Lang "en" "" "NZ" [] - "american" -> Just $ Lang "en" "" "US" [] - "classiclatin" -> Just $ Lang "la" "" "" ["x-classic"] + "austrian" -> Just $ Lang "de" Nothing (Just "AT") ["1901"] [] [] + "naustrian" -> Just $ Lang "de" Nothing (Just "AT") [] [] [] + "swissgerman" -> Just $ Lang "de" Nothing (Just "CH") ["1901"] [] [] + "nswissgerman" -> Just $ Lang "de" Nothing (Just "CH") [] [] [] + "german" -> Just $ Lang "de" Nothing (Just "DE") ["1901"] [] [] + "ngerman" -> Just $ Lang "de" Nothing (Just "DE") [] [] [] + "lowersorbian" -> Just $ Lang "dsb" Nothing Nothing [] [] [] + "uppersorbian" -> Just $ Lang "hsb" Nothing Nothing [] [] [] + "polutonikogreek" -> Just $ Lang "el" Nothing Nothing ["polyton"] [] [] + "slovene" -> Just $ simpleLang "sl" + "australian" -> Just $ Lang "en" Nothing (Just "AU") [] [] [] + "canadian" -> Just $ Lang "en" Nothing (Just "CA") [] [] [] + "british" -> Just $ Lang "en" Nothing (Just "GB") [] [] [] + "newzealand" -> Just $ Lang "en" Nothing (Just "NZ") [] [] [] + "american" -> Just $ Lang "en" Nothing (Just "US") [] [] [] + "classiclatin" -> Just $ Lang "la" Nothing Nothing ["x-classic"] [] [] _ -> ($ "") <$> M.lookup s polyglossiaLangToBCP47 diff --git a/src/Text/Pandoc/Readers/LaTeX/Macro.hs b/src/Text/Pandoc/Readers/LaTeX/Macro.hs new file mode 100644 index 000000000..5495a8e74 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Macro.hs @@ -0,0 +1,184 @@ +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Macro + ( macroDef + ) +where +import Text.Pandoc.Extensions (Extension(..)) +import Text.Pandoc.Logging (LogMessage(MacroAlreadyDefined)) +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Readers.LaTeX.Types +import Text.Pandoc.Class +import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) +import Control.Applicative ((<|>), optional) +import qualified Data.Map as M +import Data.Text (Text) +import qualified Data.Text as T + +macroDef :: (PandocMonad m, Monoid a) => (Text -> a) -> LP m a +macroDef constructor = do + (_, s) <- withRaw (commandDef <|> environmentDef) + (constructor (untokenize s) <$ + guardDisabled Ext_latex_macros) + <|> return mempty + where commandDef = do + nameMacroPairs <- newcommand <|> letmacro <|> defmacro <|> newif + guardDisabled Ext_latex_macros <|> + mapM_ (\(name, macro') -> + updateState (\s -> s{ sMacros = M.insert name macro' + (sMacros s) })) nameMacroPairs + environmentDef = do + mbenv <- newenvironment + case mbenv of + Nothing -> return () + Just (name, macro1, macro2) -> + guardDisabled Ext_latex_macros <|> + do updateState $ \s -> s{ sMacros = + M.insert name macro1 (sMacros s) } + updateState $ \s -> s{ sMacros = + M.insert ("end" <> name) macro2 (sMacros s) } + -- @\newenvironment{envname}[n-args][default]{begin}{end}@ + -- is equivalent to + -- @\newcommand{\envname}[n-args][default]{begin}@ + -- @\newcommand{\endenvname}@ + +letmacro :: PandocMonad m => LP m [(Text, Macro)] +letmacro = do + controlSeq "let" + (name, contents) <- withVerbatimMode $ do + Tok _ (CtrlSeq name) _ <- anyControlSeq + optional $ symbol '=' + spaces + -- we first parse in verbatim mode, and then expand macros, + -- because we don't want \let\foo\bar to turn into + -- \let\foo hello if we have previously \def\bar{hello} + contents <- bracedOrToken + return (name, contents) + contents' <- doMacros' 0 contents + return [(name, Macro ExpandWhenDefined [] Nothing contents')] + +defmacro :: PandocMonad m => LP m [(Text, Macro)] +defmacro = do + -- we use withVerbatimMode, because macros are to be expanded + -- at point of use, not point of definition + controlSeq "def" + withVerbatimMode $ do + Tok _ (CtrlSeq name) _ <- anyControlSeq + argspecs <- many (argspecArg <|> argspecPattern) + contents <- bracedOrToken + return [(name, Macro ExpandWhenUsed argspecs Nothing contents)] + +-- \newif\iffoo' defines: +-- \iffoo to be \iffalse +-- \footrue to be a command that defines \iffoo to be \iftrue +-- \foofalse to be a command that defines \iffoo to be \iffalse +newif :: PandocMonad m => LP m [(Text, Macro)] +newif = do + controlSeq "newif" + withVerbatimMode $ do + Tok pos (CtrlSeq name) _ <- anyControlSeq + -- \def\iffoo\iffalse + -- \def\footrue{\def\iffoo\iftrue} + -- \def\foofalse{\def\iffoo\iffalse} + let base = T.drop 2 name + return [ (name, Macro ExpandWhenUsed [] Nothing + [Tok pos (CtrlSeq "iffalse") "\\iffalse"]) + , (base <> "true", + Macro ExpandWhenUsed [] Nothing + [ Tok pos (CtrlSeq "def") "\\def" + , Tok pos (CtrlSeq name) ("\\" <> name) + , Tok pos (CtrlSeq "iftrue") "\\iftrue" + ]) + , (base <> "false", + Macro ExpandWhenUsed [] Nothing + [ Tok pos (CtrlSeq "def") "\\def" + , Tok pos (CtrlSeq name) ("\\" <> name) + , Tok pos (CtrlSeq "iffalse") "\\iffalse" + ]) + ] + +argspecArg :: PandocMonad m => LP m ArgSpec +argspecArg = do + Tok _ (Arg i) _ <- satisfyTok isArgTok + return $ ArgNum i + +argspecPattern :: PandocMonad m => LP m ArgSpec +argspecPattern = + Pattern <$> many1 (satisfyTok (\(Tok _ toktype' txt) -> + (toktype' == Symbol || toktype' == Word) && + (txt /= "{" && txt /= "\\" && txt /= "}"))) + +newcommand :: PandocMonad m => LP m [(Text, Macro)] +newcommand = do + Tok pos (CtrlSeq mtype) _ <- controlSeq "newcommand" <|> + controlSeq "renewcommand" <|> + controlSeq "providecommand" <|> + controlSeq "DeclareMathOperator" <|> + controlSeq "DeclareRobustCommand" + withVerbatimMode $ do + Tok _ (CtrlSeq name) txt <- do + optional (symbol '*') + anyControlSeq <|> + (symbol '{' *> spaces *> anyControlSeq <* spaces <* symbol '}') + spaces + numargs <- option 0 $ try bracketedNum + let argspecs = map ArgNum [1..numargs] + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + spaces + contents' <- bracedOrToken + let contents = + case mtype of + "DeclareMathOperator" -> + Tok pos (CtrlSeq "mathop") "\\mathop" + : Tok pos Symbol "{" + : Tok pos (CtrlSeq "mathrm") "\\mathrm" + : Tok pos Symbol "{" + : (contents' ++ + [ Tok pos Symbol "}", Tok pos Symbol "}" ]) + _ -> contents' + macros <- sMacros <$> getState + case M.lookup name macros of + Just macro + | mtype == "newcommand" -> do + report $ MacroAlreadyDefined txt pos + return [(name, macro)] + | mtype == "providecommand" -> return [(name, macro)] + _ -> return [(name, Macro ExpandWhenUsed argspecs optarg contents)] + +newenvironment :: PandocMonad m => LP m (Maybe (Text, Macro, Macro)) +newenvironment = do + pos <- getPosition + Tok _ (CtrlSeq mtype) _ <- controlSeq "newenvironment" <|> + controlSeq "renewenvironment" <|> + controlSeq "provideenvironment" + withVerbatimMode $ do + optional $ symbol '*' + spaces + name <- untokenize <$> braced + spaces + numargs <- option 0 $ try bracketedNum + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + let argspecs = map (\i -> ArgNum i) [1..numargs] + startcontents <- spaces >> bracedOrToken + endcontents <- spaces >> bracedOrToken + macros <- sMacros <$> getState + case M.lookup name macros of + Just _ + | mtype == "newenvironment" -> do + report $ MacroAlreadyDefined name pos + return Nothing + | mtype == "provideenvironment" -> + return Nothing + _ -> return $ Just (name, + Macro ExpandWhenUsed argspecs optarg startcontents, + Macro ExpandWhenUsed [] Nothing endcontents) + +bracketedNum :: PandocMonad m => LP m Int +bracketedNum = do + ds <- untokenize <$> bracketedToks + case safeRead ds of + Just i -> return i + _ -> return 0 diff --git a/src/Text/Pandoc/Readers/LaTeX/Math.hs b/src/Text/Pandoc/Readers/LaTeX/Math.hs new file mode 100644 index 000000000..5b49a0376 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Math.hs @@ -0,0 +1,221 @@ +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Math + ( dollarsMath + , inlineEnvironments + , inlineEnvironment + , mathInline + , mathDisplay + , theoremstyle + , theoremEnvironment + , newtheorem + , proof + ) +where +import Data.Maybe (fromMaybe) +import Text.Pandoc.Walk (walk) +import Text.Pandoc.Builder as B +import qualified Data.Sequence as Seq +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Readers.LaTeX.Types +import Text.Pandoc.Class +import Text.Pandoc.Shared (trimMath, stripTrailingNewlines) +import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) +import Control.Applicative ((<|>), optional) +import Control.Monad (guard, mzero) +import qualified Data.Map as M +import Data.Text (Text) + +dollarsMath :: PandocMonad m => LP m Inlines +dollarsMath = do + symbol '$' + display <- option False (True <$ symbol '$') + (do contents <- try $ untokenize <$> pDollarsMath 0 + if display + then mathDisplay contents <$ symbol '$' + else return $ mathInline contents) + <|> (guard display >> return (mathInline "")) + +-- Int is number of embedded groupings +pDollarsMath :: PandocMonad m => Int -> LP m [Tok] +pDollarsMath n = do + tk@(Tok _ toktype t) <- anyTok + case toktype of + Symbol | t == "$" + , n == 0 -> return [] + | t == "\\" -> do + tk' <- anyTok + (tk :) . (tk' :) <$> pDollarsMath n + | t == "{" -> (tk :) <$> pDollarsMath (n+1) + | t == "}" -> + if n > 0 + then (tk :) <$> pDollarsMath (n-1) + else mzero + _ -> (tk :) <$> pDollarsMath n + +mathDisplay :: Text -> Inlines +mathDisplay = displayMath . trimMath + +mathInline :: Text -> Inlines +mathInline = math . trimMath + +mathEnvWith :: PandocMonad m + => (Inlines -> a) -> Maybe Text -> Text -> LP m a +mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name + where inner x = case innerEnv of + Nothing -> x + Just y -> "\\begin{" <> y <> "}\n" <> x <> + "\\end{" <> y <> "}" + +mathEnv :: PandocMonad m => Text -> LP m Text +mathEnv name = do + skipopts + optional blankline + res <- manyTill anyTok (end_ name) + return $ stripTrailingNewlines $ untokenize res + +inlineEnvironment :: PandocMonad m => LP m Inlines +inlineEnvironment = try $ do + controlSeq "begin" + name <- untokenize <$> braced + M.findWithDefault mzero name inlineEnvironments + +inlineEnvironments :: PandocMonad m => M.Map Text (LP m Inlines) +inlineEnvironments = M.fromList [ + ("displaymath", mathEnvWith id Nothing "displaymath") + , ("math", math <$> mathEnv "math") + , ("equation", mathEnvWith id Nothing "equation") + , ("equation*", mathEnvWith id Nothing "equation*") + , ("gather", mathEnvWith id (Just "gathered") "gather") + , ("gather*", mathEnvWith id (Just "gathered") "gather*") + , ("multline", mathEnvWith id (Just "gathered") "multline") + , ("multline*", mathEnvWith id (Just "gathered") "multline*") + , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*") + , ("align", mathEnvWith id (Just "aligned") "align") + , ("align*", mathEnvWith id (Just "aligned") "align*") + , ("alignat", mathEnvWith id (Just "aligned") "alignat") + , ("alignat*", mathEnvWith id (Just "aligned") "alignat*") + , ("dmath", mathEnvWith id Nothing "dmath") + , ("dmath*", mathEnvWith id Nothing "dmath*") + , ("dgroup", mathEnvWith id (Just "aligned") "dgroup") + , ("dgroup*", mathEnvWith id (Just "aligned") "dgroup*") + , ("darray", mathEnvWith id (Just "aligned") "darray") + , ("darray*", mathEnvWith id (Just "aligned") "darray*") + ] + +theoremstyle :: PandocMonad m => LP m Blocks +theoremstyle = do + stylename <- untokenize <$> braced + let mbstyle = case stylename of + "plain" -> Just PlainStyle + "definition" -> Just DefinitionStyle + "remark" -> Just RemarkStyle + _ -> Nothing + case mbstyle of + Nothing -> return () + Just sty -> updateState $ \s -> s{ sLastTheoremStyle = sty } + return mempty + +newtheorem :: PandocMonad m => LP m Inlines -> LP m Blocks +newtheorem inline = do + number <- option True (False <$ symbol '*' <* sp) + name <- untokenize <$> braced + sp + series <- option Nothing $ Just . untokenize <$> bracketedToks + sp + showName <- tokWith inline + sp + syncTo <- option Nothing $ Just . untokenize <$> bracketedToks + sty <- sLastTheoremStyle <$> getState + let spec = TheoremSpec { theoremName = showName + , theoremStyle = sty + , theoremSeries = series + , theoremSyncTo = syncTo + , theoremNumber = number + , theoremLastNum = DottedNum [0] } + tmap <- sTheoremMap <$> getState + updateState $ \s -> s{ sTheoremMap = + M.insert name spec tmap } + return mempty + +theoremEnvironment :: PandocMonad m + => LP m Blocks -> LP m Inlines -> Text -> LP m Blocks +theoremEnvironment blocks opt name = do + tmap <- sTheoremMap <$> getState + case M.lookup name tmap of + Nothing -> mzero + Just tspec -> do + optTitle <- option mempty $ (\x -> space <> "(" <> x <> ")") <$> opt + mblabel <- option Nothing $ Just . untokenize <$> + try (spaces >> controlSeq "label" >> spaces >> braced) + bs <- env name blocks + number <- + if theoremNumber tspec + then do + let name' = fromMaybe name $ theoremSeries tspec + num <- getNextNumber + (maybe (DottedNum [0]) theoremLastNum . + M.lookup name' . sTheoremMap) + updateState $ \s -> + s{ sTheoremMap = + M.adjust + (\spec -> spec{ theoremLastNum = num }) + name' + (sTheoremMap s) + } + + case mblabel of + Just ident -> + updateState $ \s -> + s{ sLabels = M.insert ident + (B.toList $ + theoremName tspec <> "\160" <> + str (renderDottedNum num)) (sLabels s) } + Nothing -> return () + return $ space <> B.text (renderDottedNum num) + else return mempty + let titleEmph = case theoremStyle tspec of + PlainStyle -> B.strong + DefinitionStyle -> B.strong + RemarkStyle -> B.emph + let title = titleEmph (theoremName tspec <> number) + <> optTitle <> "." <> space + return $ divWith (fromMaybe "" mblabel, [name], []) $ addTitle title + $ case theoremStyle tspec of + PlainStyle -> walk italicize bs + _ -> bs + + + +proof :: PandocMonad m => LP m Blocks -> LP m Inlines -> LP m Blocks +proof blocks opt = do + title <- option (B.text "Proof") opt + bs <- env "proof" blocks + return $ + B.divWith ("", ["proof"], []) $ + addQed $ addTitle (B.emph (title <> ".")) bs + +addTitle :: Inlines -> Blocks -> Blocks +addTitle ils bs = + case B.toList bs of + (Para xs : rest) + -> B.fromList (Para (B.toList ils ++ (Space : xs)) : rest) + _ -> B.para ils <> bs + +addQed :: Blocks -> Blocks +addQed bs = + case Seq.viewr (B.unMany bs) of + s Seq.:> Para ils + -> B.Many (s Seq.|> Para (ils ++ B.toList qedSign)) + _ -> bs <> B.para qedSign + where + qedSign = B.str "\xa0\x25FB" + +italicize :: Block -> Block +italicize x@(Para [Image{}]) = x -- see #6925 +italicize (Para ils) = Para [Emph ils] +italicize (Plain ils) = Plain [Emph ils] +italicize x = x + + diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs index 563d32883..9dac4d6ef 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs @@ -1,11 +1,12 @@ {-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.LaTeX.Parsing - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,11 +28,15 @@ module Text.Pandoc.Readers.LaTeX.Parsing , rawLaTeXParser , applyMacros , tokenize + , tokenizeSources + , getInputTokens , untokenize , untoken , totoks , toksToString , satisfyTok + , parseFromToks + , disablingWithRaw , doMacros , doMacros' , setpos @@ -52,6 +57,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing , comment , anyTok , singleChar + , tokWith , specialChars , endline , blankline @@ -78,6 +84,11 @@ module Text.Pandoc.Readers.LaTeX.Parsing , rawopt , overlaySpecification , getNextNumber + , label + , setCaption + , resetCaption + , env + , addMeta ) where import Control.Applicative (many, (<|>)) @@ -87,13 +98,15 @@ import Control.Monad.Trans (lift) import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord) import Data.Default import Data.List (intercalate) +import qualified Data.IntMap as IntMap import qualified Data.Map as M import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad, report) -import Text.Pandoc.Error (PandocError (PandocMacroLoop)) +import Text.Pandoc.Error + (PandocError (PandocMacroLoop,PandocShouldNeverHappenError)) import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, @@ -102,7 +115,6 @@ import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..), ArgSpec (..), Tok (..), TokType (..)) import Text.Pandoc.Shared import Text.Parsec.Pos --- import Debug.Trace newtype DottedNum = DottedNum [Int] deriving (Show, Eq) @@ -151,7 +163,9 @@ data LaTeXState = LaTeXState{ sOptions :: ReaderOptions , sLabels :: M.Map Text [Inline] , sHasChapters :: Bool , sToggles :: M.Map Text Bool - , sExpanded :: Bool + , sFileContents :: M.Map Text Text + , sEnableWithRaw :: Bool + , sRawTokens :: IntMap.IntMap [Tok] } deriving Show @@ -176,7 +190,9 @@ defaultLaTeXState = LaTeXState{ sOptions = def , sLabels = M.empty , sHasChapters = False , sToggles = M.empty - , sExpanded = False + , sFileContents = M.empty + , sEnableWithRaw = True + , sRawTokens = IntMap.empty } instance PandocMonad m => HasQuoteContext LaTeXState m where @@ -232,21 +248,25 @@ withVerbatimMode parser = do updateState $ \st -> st{ sVerbatimMode = False } return result -rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s) +rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s, Show a) => [Tok] -> Bool -> LP m a -> LP m a - -> ParserT Text s m (a, Text) + -> ParserT Sources s m (a, Text) rawLaTeXParser toks retokenize parser valParser = do pstate <- getState let lstate = def{ sOptions = extractReaderOptions pstate } let lstate' = lstate { sMacros = extractMacros pstate } + let setStartPos = case toks of + Tok pos _ _ : _ -> setPosition pos + _ -> return () + let preparser = setStartPos >> parser let rawparser = (,) <$> withRaw valParser <*> getState - res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks + res' <- lift $ runParserT (withRaw (preparser >> getPosition)) + lstate "chunk" toks case res' of Left _ -> mzero - Right toks' -> do + Right (endpos, toks') -> do res <- lift $ runParserT (do when retokenize $ do -- retokenize, applying macros - doMacros ts <- many (satisfyTok (const True)) setInput ts rawparser) @@ -255,7 +275,13 @@ rawLaTeXParser toks retokenize parser valParser = do Left _ -> mzero Right ((val, raw), st) -> do updateState (updateMacros (sMacros st <>)) - _ <- takeP (T.length (untokenize toks')) + let skipTilPos stopPos = do + anyChar + pos <- getPosition + if pos >= stopPos + then return () + else skipTilPos stopPos + skipTilPos endpos let result = untokenize raw -- ensure we end with space if input did, see #4442 let result' = @@ -268,7 +294,7 @@ rawLaTeXParser toks retokenize parser valParser = do return (val, result') applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) - => Text -> ParserT Text s m Text + => Text -> ParserT Sources s m Text applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> do let retokenize = untokenize <$> many (satisfyTok (const True)) pstate <- getState @@ -279,6 +305,31 @@ applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> Left e -> Prelude.fail (show e) Right s' -> return s' +{- +When tokenize or untokenize change, test with this +QuickCheck property: + +> tokUntokRoundtrip :: String -> Bool +> tokUntokRoundtrip s = +> let t = T.pack s in untokenize (tokenize "random" t) == t +-} + +tokenizeSources :: Sources -> [Tok] +tokenizeSources = concatMap tokenizeSource . unSources + where + tokenizeSource (pos, t) = totoks pos t + +-- Return tokens from input sources. Ensure that starting position is +-- correct. +getInputTokens :: PandocMonad m => ParserT Sources s m [Tok] +getInputTokens = do + pos <- getPosition + ss <- getInput + return $ + case ss of + Sources [] -> [] + Sources ((_,t):rest) -> tokenizeSources $ Sources ((pos,t):rest) + tokenize :: SourceName -> Text -> [Tok] tokenize sourcename = totoks (initialPos sourcename) @@ -402,41 +453,62 @@ untoken t = untokenAccum t mempty toksToString :: [Tok] -> String toksToString = T.unpack . untokenize +parseFromToks :: PandocMonad m => LP m a -> [Tok] -> LP m a +parseFromToks parser toks = do + oldInput <- getInput + setInput toks + oldpos <- getPosition + case toks of + Tok pos _ _ : _ -> setPosition pos + _ -> return () + result <- disablingWithRaw parser + setInput oldInput + setPosition oldpos + return result + +disablingWithRaw :: PandocMonad m => LP m a -> LP m a +disablingWithRaw parser = do + oldEnableWithRaw <- sEnableWithRaw <$> getState + updateState $ \st -> st{ sEnableWithRaw = False } + result <- parser + updateState $ \st -> st{ sEnableWithRaw = oldEnableWithRaw } + return result + satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok satisfyTok f = do doMacros -- apply macros on remaining input stream res <- tokenPrim (T.unpack . untoken) updatePos matcher - updateState $ \st -> st{ sExpanded = False } - return res + updateState $ \st -> + if sEnableWithRaw st + then st{ sRawTokens = IntMap.map (res:) $ sRawTokens st } + else st + return $! res where matcher t | f t = Just t | otherwise = Nothing updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos updatePos _spos _ (Tok pos _ _ : _) = pos - updatePos spos _ [] = incSourceColumn spos 1 + updatePos spos (Tok _ _ t) [] = incSourceColumn spos (T.length t) doMacros :: PandocMonad m => LP m () doMacros = do - expanded <- sExpanded <$> getState - verbatimMode <- sVerbatimMode <$> getState - unless (expanded || verbatimMode) $ do - getInput >>= doMacros' 1 >>= setInput - updateState $ \st -> st{ sExpanded = True } + st <- getState + unless (sVerbatimMode st) $ + getInput >>= doMacros' 1 >>= setInput doMacros' :: PandocMonad m => Int -> [Tok] -> LP m [Tok] doMacros' n inp = case inp of Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" : Tok _ Word name : Tok _ Symbol "}" : ts - -> handleMacros n spos name ts + -> handleMacros n spos name ts <|> return inp Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" : Tok _ Word name : Tok _ Symbol "}" : ts - -> handleMacros n spos ("end" <> name) ts + -> handleMacros n spos ("end" <> name) ts <|> return inp Tok _ (CtrlSeq "expandafter") _ : t : ts -> combineTok t <$> doMacros' n ts Tok spos (CtrlSeq name) _ : ts - -> handleMacros n spos name ts + -> handleMacros n spos name ts <|> return inp _ -> return inp - <|> return inp where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts) @@ -482,7 +554,7 @@ doMacros' n inp = $ throwError $ PandocMacroLoop name macros <- sMacros <$> getState case M.lookup name macros of - Nothing -> mzero + Nothing -> trySpecialMacro name ts Just (Macro expansionPoint argspecs optarg newtoks) -> do let getargs' = do args <- @@ -510,6 +582,41 @@ doMacros' n inp = ExpandWhenUsed -> doMacros' (n' + 1) result ExpandWhenDefined -> return result +-- | Certain macros do low-level tex manipulations that can't +-- be represented in our Macro type, so we handle them here. +trySpecialMacro :: PandocMonad m => Text -> [Tok] -> LP m [Tok] +trySpecialMacro "xspace" ts = do + ts' <- doMacros' 1 ts + case ts' of + Tok pos Word t : _ + | startsWithAlphaNum t -> return $ Tok pos Spaces " " : ts' + _ -> return ts' +trySpecialMacro "iftrue" ts = handleIf True ts +trySpecialMacro "iffalse" ts = handleIf False ts +trySpecialMacro _ _ = mzero + +handleIf :: PandocMonad m => Bool -> [Tok] -> LP m [Tok] +handleIf b ts = do + res' <- lift $ runParserT (ifParser b) defaultLaTeXState "tokens" ts + case res' of + Left _ -> Prelude.fail "Could not parse conditional" + Right ts' -> return ts' + +ifParser :: PandocMonad m => Bool -> LP m [Tok] +ifParser b = do + ifToks <- many (notFollowedBy (controlSeq "else" <|> controlSeq "fi") + *> anyTok) + elseToks <- (controlSeq "else" >> manyTill anyTok (controlSeq "fi")) + <|> ([] <$ controlSeq "fi") + rest <- getInput + return $ (if b then ifToks else elseToks) ++ rest + +startsWithAlphaNum :: Text -> Bool +startsWithAlphaNum t = + case T.uncons t of + Just (c, _) | isAlphaNum c -> True + _ -> False + setpos :: SourcePos -> Tok -> Tok setpos spos (Tok _ tt txt) = Tok spos tt txt @@ -592,18 +699,22 @@ isCommentTok _ = False anyTok :: PandocMonad m => LP m Tok anyTok = satisfyTok (const True) +singleCharTok :: PandocMonad m => LP m Tok +singleCharTok = + satisfyTok $ \case + Tok _ Word t -> T.length t == 1 + Tok _ Symbol t -> not (T.any (`Set.member` specialChars) t) + _ -> False + singleChar :: PandocMonad m => LP m Tok -singleChar = try $ do - Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) - guard $ not $ toktype == Symbol && - T.any (`Set.member` specialChars) t - if T.length t > 1 - then do - let (t1, t2) = (T.take 1 t, T.drop 1 t) - inp <- getInput - setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp - return $ Tok pos toktype t1 - else return $ Tok pos toktype t +singleChar = singleCharTok <|> singleCharFromWord + where + singleCharFromWord = do + Tok pos toktype t <- disablingWithRaw $ satisfyTok isWordTok + let (t1, t2) = (T.take 1 t, T.drop 1 t) + inp <- getInput + setInput $ Tok pos toktype t1 : Tok (incSourceColumn pos 1) toktype t2 : inp + anyTok specialChars :: Set.Set Char specialChars = Set.fromList "#$%&~_^\\{}" @@ -646,28 +757,25 @@ grouped parser = try $ do -- {{a,b}} should be parsed the same as {a,b} try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup) -braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok] -braced' getTok n = - handleEgroup <|> handleBgroup <|> handleOther - where handleEgroup = do - t <- symbol '}' - if n == 1 - then return [] - else (t:) <$> braced' getTok (n - 1) - handleBgroup = do - t <- symbol '{' - (t:) <$> braced' getTok (n + 1) - handleOther = do - t <- getTok - (t:) <$> braced' getTok n +braced' :: PandocMonad m => LP m Tok -> LP m [Tok] +braced' getTok = symbol '{' *> go (1 :: Int) + where + go n = do + t <- getTok + case t of + Tok _ Symbol "}" + | n > 1 -> (t:) <$> go (n - 1) + | otherwise -> return [] + Tok _ Symbol "{" -> (t:) <$> go (n + 1) + _ -> (t:) <$> go n braced :: PandocMonad m => LP m [Tok] -braced = symbol '{' *> braced' anyTok 1 +braced = braced' anyTok -- URLs require special handling, because they can contain % -- characters. So we retonenize comments as we go... bracedUrl :: PandocMonad m => LP m [Tok] -bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1 +bracedUrl = braced' (retokenizeComment >> anyTok) -- For handling URLs, which allow literal % characters... retokenizeComment :: PandocMonad m => LP m () @@ -723,16 +831,29 @@ ignore raw = do withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok]) withRaw parser = do - inp <- getInput + rawTokensMap <- sRawTokens <$> getState + let key = case IntMap.lookupMax rawTokensMap of + Nothing -> 0 + Just (n,_) -> n + 1 + -- insert empty list at key + updateState $ \st -> st{ sRawTokens = + IntMap.insert key [] $ sRawTokens st } result <- parser - nxtpos <- option Nothing ((\(Tok pos' _ _) -> Just pos') <$> lookAhead anyTok) - let raw = takeWhile (\(Tok pos _ _) -> maybe True - (\p -> sourceName p /= sourceName pos || pos < p) nxtpos) inp + mbRevToks <- IntMap.lookup key . sRawTokens <$> getState + raw <- case mbRevToks of + Just revtoks -> do + updateState $ \st -> st{ sRawTokens = + IntMap.delete key $ sRawTokens st} + return $ reverse revtoks + Nothing -> + throwError $ PandocShouldNeverHappenError $ + "sRawTokens has nothing at key " <> T.pack (show key) return (result, raw) keyval :: PandocMonad m => LP m (Text, Text) keyval = try $ do - Tok _ Word key <- satisfyTok isWordTok + key <- untokenize <$> many1 (notFollowedBy (symbol '=') >> + (symbol '-' <|> symbol '_' <|> satisfyTok isWordTok)) sp val <- option mempty $ do symbol '=' @@ -792,7 +913,7 @@ getRawCommand name txt = do (_, rawargs) <- withRaw $ case name of "write" -> do - void $ satisfyTok isWordTok -- digits + void $ many $ satisfyTok isDigitTok -- digits void braced "titleformat" -> do void braced @@ -807,6 +928,10 @@ getRawCommand name txt = do void $ many braced return $ txt <> untokenize rawargs +isDigitTok :: Tok -> Bool +isDigitTok (Tok _ Word t) = T.all isDigit t +isDigitTok _ = False + skipopts :: PandocMonad m => LP m () skipopts = skipMany (void overlaySpecification <|> void rawopt) @@ -874,3 +999,35 @@ getNextNumber getCurrentNum = do Just n -> [n, 1] Nothing -> [1] +label :: PandocMonad m => LP m () +label = do + controlSeq "label" + t <- braced + updateState $ \st -> st{ sLastLabel = Just $ untokenize t } + +setCaption :: PandocMonad m => LP m Inlines -> LP m () +setCaption inline = try $ do + skipopts + ils <- tokWith inline + optional $ try $ spaces *> label + updateState $ \st -> st{ sCaption = Just ils } + +resetCaption :: PandocMonad m => LP m () +resetCaption = updateState $ \st -> st{ sCaption = Nothing + , sLastLabel = Nothing } + +env :: PandocMonad m => Text -> LP m a -> LP m a +env name p = p <* end_ name + +tokWith :: PandocMonad m => LP m Inlines -> LP m Inlines +tokWith inlineParser = try $ spaces >> + grouped inlineParser + <|> (lookAhead anyControlSeq >> inlineParser) + <|> singleChar' + where singleChar' = do + Tok _ _ t <- singleChar + return $ str t + +addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> LP m () +addMeta field val = updateState $ \st -> + st{ sMeta = addMetaField field val $ sMeta st } diff --git a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs index db9c276e7..b8bf0ce7f 100644 --- a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs +++ b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs @@ -1,12 +1,7 @@ +{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE OverloadedStrings #-} module Text.Pandoc.Readers.LaTeX.SIunitx - ( dosi - , doSI - , doSIrange - , doSInum - , doSInumlist - , doSIang - ) + ( siunitxCommands ) where import Text.Pandoc.Builder import Text.Pandoc.Readers.LaTeX.Parsing @@ -15,14 +10,32 @@ import Text.Pandoc.Class import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline, optional, space, spaces, withRaw, (<|>)) import Control.Applicative ((<|>)) +import Control.Monad (void) import qualified Data.Map as M import Data.Char (isDigit) import Data.Text (Text) import qualified Data.Text as T import Data.List (intersperse) +import qualified Data.Sequence as Seq +import Text.Pandoc.Walk (walk) + +siunitxCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +siunitxCommands tok = M.fromList + [ ("si", dosi tok) + , ("SI", doSI tok) + , ("SIrange", doSIrange True tok) + , ("numrange", doSIrange False tok) + , ("numlist", doSInumlist) + , ("SIlist", doSIlist tok) + , ("num", doSInum) + , ("ang", doSIang) + ] dosi :: PandocMonad m => LP m Inlines -> LP m Inlines -dosi tok = grouped (siUnit tok) <|> siUnit tok +dosi tok = do + options <- option [] keyvals + grouped (siUnit options tok) <|> siUnit options tok -- converts e.g. \SI{1}[\$]{} to "$ 1" or \SI{1}{\euro} to "1 €" doSI :: PandocMonad m => LP m Inlines -> LP m Inlines @@ -57,23 +70,50 @@ doSInumlist = do mconcat (intersperse (str "," <> space) (init xs)) <> text ", & " <> last xs +doSIlist :: PandocMonad m => LP m Inlines -> LP m Inlines +doSIlist tok = do + options <- option [] keyvals + nums <- map tonum . T.splitOn ";" . untokenize <$> braced + unit <- grouped (siUnit options tok) <|> siUnit options tok + let xs = map (<> (str "\xa0" <> unit)) nums + case xs of + [] -> return mempty + [x] -> return x + _ -> return $ + mconcat (intersperse (str "," <> space) (init xs)) <> + text ", & " <> last xs + parseNum :: Parser Text () Inlines parseNum = (mconcat <$> many parseNumPart) <* eof +minus :: Text +minus = "\x2212" + +hyphenToMinus :: Inline -> Inline +hyphenToMinus (Str t) = Str (T.replace "-" minus t) +hyphenToMinus x = x + parseNumPart :: Parser Text () Inlines parseNumPart = parseDecimalNum <|> parseComma <|> parsePlusMinus <|> + parsePM <|> parseI <|> parseExp <|> parseX <|> parseSpace where - parseDecimalNum = do - pref <- option mempty $ (mempty <$ char '+') <|> ("\x2212" <$ char '-') - basenum <- (pref <>) . T.pack - <$> many1 (satisfy (\c -> isDigit c || c == '.')) + parseDecimalNum, parsePlusMinus, parsePM, + parseComma, parseI, parseX, + parseExp, parseSpace :: Parser Text () Inlines + parseDecimalNum = try $ do + pref <- option mempty $ (mempty <$ char '+') <|> (minus <$ char '-') + basenum' <- many1 (satisfy (\c -> isDigit c || c == '.')) + let basenum = pref <> T.pack + (case basenum' of + '.':_ -> '0':basenum' + _ -> basenum') uncertainty <- option mempty $ T.pack <$> parseParens if T.null uncertainty then return $ str basenum @@ -91,6 +131,7 @@ parseNumPart = | otherwise -> "." <> t parseComma = str "." <$ char ',' parsePlusMinus = str "\xa0\xb1\xa0" <$ try (string "+-") + parsePM = str "\xa0\xb1\xa0" <$ try (string "\\pm") parseParens = char '(' *> many1 (satisfy (\c -> isDigit c || c == '.')) <* char ')' parseI = str "i" <$ char 'i' @@ -103,11 +144,14 @@ doSIang :: PandocMonad m => LP m Inlines doSIang = do skipopts ps <- T.splitOn ";" . untokenize <$> braced + let dropPlus t = case T.uncons t of + Just ('+',t') -> t' + _ -> t case ps ++ repeat "" of (d:m:s:_) -> return $ - (if T.null d then mempty else str d <> str "\xb0") <> - (if T.null m then mempty else str m <> str "\x2032") <> - (if T.null s then mempty else str s <> str "\x2033") + (if T.null d then mempty else str (dropPlus d) <> str "\xb0") <> + (if T.null m then mempty else str (dropPlus m) <> str "\x2032") <> + (if T.null s then mempty else str (dropPlus s) <> str "\x2033") _ -> return mempty -- converts e.g. \SIrange{100}{200}{\ms} to "100 ms--200 ms" @@ -136,40 +180,99 @@ doSIrange includeUnits tok = do emptyOr160 :: Inlines -> Inlines emptyOr160 x = if x == mempty then x else str "\160" -siUnit :: PandocMonad m => LP m Inlines -> LP m Inlines -siUnit tok = try (do - Tok _ (CtrlSeq name) _ <- anyControlSeq - case name of - "square" -> do - unit <- siUnit tok - return $ unit <> superscript "2" - "cubic" -> do - unit <- siUnit tok - return $ unit <> superscript "3" - "raisetothe" -> do - n <- tok - unit <- siUnit tok - return $ unit <> superscript n - _ -> - case M.lookup name siUnitMap of - Just il -> - option il $ - choice - [ (il <> superscript "2") <$ controlSeq "squared" - , (il <> superscript "3") <$ controlSeq "cubed" - , (\n -> il <> superscript n) <$> (controlSeq "tothe" *> tok) - ] - Nothing -> fail "not an siunit unit command") - <|> (lookAhead anyControlSeq >> tok) - <|> (do Tok _ Word t <- satisfyTok isWordTok - return $ str t) - <|> (symbol '^' *> (superscript <$> tok)) - <|> (symbol '_' *> (subscript <$> tok)) - <|> ("\xa0" <$ symbol '.') - <|> ("\xa0" <$ symbol '~') - <|> tok - <|> (do Tok _ _ t <- anyTok - return (str t)) +siUnit :: forall m. PandocMonad m => [(Text,Text)] -> LP m Inlines -> LP m Inlines +siUnit options tok = mconcat . intersperse (str "\xa0") <$> many1 siUnitPart + where + siUnitPart :: LP m Inlines + siUnitPart = try $ do + skipMany (void (symbol '.') <|> void (symbol '~') <|> spaces1) + x <- ((siPrefix <*> siBase) + <|> (do u <- siBase <|> tok + option u $ siSuffix <*> pure u)) + option x (siInfix x) + siInfix :: Inlines -> LP m Inlines + siInfix u1 = try $ + (do _ <- controlSeq "per" + u2 <- siUnitPart + let useSlash = lookup "per-mode" options == Just "symbol" + if useSlash + then return (u1 <> str "/" <> u2) + else return (u1 <> str "\xa0" <> negateExponent u2)) + <|> (do _ <- symbol '/' + u2 <- siUnitPart + return (u1 <> str "/" <> u2)) + siPrefix :: LP m (Inlines -> Inlines) + siPrefix = + (do _ <- controlSeq "square" + skipopts + return (<> superscript "2")) + <|> (do _ <- controlSeq "cubic" + skipopts + return (<> superscript "3")) + <|> (do _ <- controlSeq "raisetothe" + skipopts + n <- walk hyphenToMinus <$> tok + return (<> superscript n)) + siSuffix :: LP m (Inlines -> Inlines) + siSuffix = + (do _ <- controlSeq "squared" + skipopts + return (<> superscript "2")) + <|> (do _ <- controlSeq "cubed" + skipopts + return (<> superscript "3")) + <|> (do _ <- controlSeq "tothe" + skipopts + n <- walk hyphenToMinus <$> tok + return (<> superscript n)) + <|> (symbol '^' *> (do n <- walk hyphenToMinus <$> tok + return (<> superscript n))) + <|> (symbol '_' *> (do n <- walk hyphenToMinus <$> tok + return (<> subscript n))) + negateExponent :: Inlines -> Inlines + negateExponent ils = + case Seq.viewr (unMany ils) of + xs Seq.:> Superscript ss -> (Many xs) <> + superscript (str minus <> fromList ss) + _ -> ils <> superscript (str (minus <> "1")) + siBase :: LP m Inlines + siBase = + ((try + (do Tok _ (CtrlSeq name) _ <- anyControlSeq + case M.lookup name siUnitModifierMap of + Just il -> (il <>) <$> siBase + Nothing -> + case M.lookup name siUnitMap of + Just il -> pure il + Nothing -> fail "not a unit command")) + <|> (do Tok _ Word t <- satisfyTok isWordTok + return $ str t) + ) + +siUnitModifierMap :: M.Map Text Inlines +siUnitModifierMap = M.fromList + [ ("atto", str "a") + , ("centi", str "c") + , ("deca", str "d") + , ("deci", str "d") + , ("deka", str "d") + , ("exa", str "E") + , ("femto", str "f") + , ("giga", str "G") + , ("hecto", str "h") + , ("kilo", str "k") + , ("mega", str "M") + , ("micro", str "μ") + , ("milli", str "m") + , ("nano", str "n") + , ("peta", str "P") + , ("pico", str "p") + , ("tera", str "T") + , ("yocto", str "y") + , ("yotta", str "Y") + , ("zepto", str "z") + , ("zetta", str "Z") + ] siUnitMap :: M.Map Text Inlines siUnitMap = M.fromList @@ -269,7 +372,6 @@ siUnitMap = M.fromList , ("arcsecond", str "″") , ("astronomicalunit", str "ua") , ("atomicmassunit", str "u") - , ("atto", str "a") , ("bar", str "bar") , ("barn", str "b") , ("becquerel", str "Bq") @@ -277,51 +379,38 @@ siUnitMap = M.fromList , ("bohr", emph (str "a") <> subscript (str "0")) , ("candela", str "cd") , ("celsius", str "°C") - , ("centi", str "c") , ("clight", emph (str "c") <> subscript (str "0")) , ("coulomb", str "C") , ("dalton", str "Da") , ("day", str "d") - , ("deca", str "d") - , ("deci", str "d") , ("decibel", str "db") , ("degreeCelsius",str "°C") , ("degree", str "°") - , ("deka", str "d") , ("electronmass", emph (str "m") <> subscript (str "e")) , ("electronvolt", str "eV") , ("elementarycharge", emph (str "e")) - , ("exa", str "E") , ("farad", str "F") - , ("femto", str "f") - , ("giga", str "G") , ("gram", str "g") , ("gray", str "Gy") , ("hartree", emph (str "E") <> subscript (str "h")) , ("hectare", str "ha") - , ("hecto", str "h") , ("henry", str "H") , ("hertz", str "Hz") , ("hour", str "h") , ("joule", str "J") , ("katal", str "kat") , ("kelvin", str "K") - , ("kilo", str "k") , ("kilogram", str "kg") , ("knot", str "kn") , ("liter", str "L") , ("litre", str "l") , ("lumen", str "lm") , ("lux", str "lx") - , ("mega", str "M") , ("meter", str "m") , ("metre", str "m") - , ("micro", str "μ") - , ("milli", str "m") , ("minute", str "min") , ("mmHg", str "mmHg") , ("mole", str "mol") - , ("nano", str "n") , ("nauticalmile", str "M") , ("neper", str "Np") , ("newton", str "N") @@ -329,25 +418,17 @@ siUnitMap = M.fromList , ("Pa", str "Pa") , ("pascal", str "Pa") , ("percent", str "%") - , ("per", str "/") - , ("peta", str "P") - , ("pico", str "p") , ("planckbar", emph (str "\x210f")) , ("radian", str "rad") , ("second", str "s") , ("siemens", str "S") , ("sievert", str "Sv") , ("steradian", str "sr") - , ("tera", str "T") , ("tesla", str "T") , ("tonne", str "t") , ("volt", str "V") , ("watt", str "W") , ("weber", str "Wb") - , ("yocto", str "y") - , ("yotta", str "Y") - , ("zepto", str "z") - , ("zetta", str "Z") ] diff --git a/src/Text/Pandoc/Readers/LaTeX/Table.hs b/src/Text/Pandoc/Readers/LaTeX/Table.hs new file mode 100644 index 000000000..f56728fe1 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Table.hs @@ -0,0 +1,379 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Table + ( tableEnvironments ) +where + +import Data.Functor (($>)) +import Text.Pandoc.Class +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Readers.LaTeX.Types +import Text.Pandoc.Builder as B +import qualified Data.Map as M +import Data.Text (Text) +import Data.Maybe (fromMaybe) +import qualified Data.Text as T +import Control.Applicative ((<|>), optional, many) +import Control.Monad (when, void) +import Text.Pandoc.Shared (safeRead, trim) +import Text.Pandoc.Logging (LogMessage(SkippedContent)) +import Text.Pandoc.Walk (walkM) +import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) + +tableEnvironments :: PandocMonad m + => LP m Blocks + -> LP m Inlines + -> M.Map Text (LP m Blocks) +tableEnvironments blocks inline = + M.fromList + [ ("longtable", env "longtable" $ + resetCaption *> + simpTable blocks inline "longtable" False >>= addTableCaption) + , ("table", env "table" $ + skipopts *> resetCaption *> blocks >>= addTableCaption) + , ("tabular*", env "tabular*" $ simpTable blocks inline "tabular*" True) + , ("tabularx", env "tabularx" $ simpTable blocks inline "tabularx" True) + , ("tabular", env "tabular" $ simpTable blocks inline "tabular" False) + ] + +hline :: PandocMonad m => LP m () +hline = try $ do + spaces + controlSeq "hline" <|> + (controlSeq "cline" <* braced) <|> + -- booktabs rules: + controlSeq "toprule" <|> + controlSeq "bottomrule" <|> + controlSeq "midrule" <|> + controlSeq "endhead" <|> + controlSeq "endfirsthead" + spaces + optional rawopt + return () + +lbreak :: PandocMonad m => LP m Tok +lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline") + <* skipopts <* spaces + +amp :: PandocMonad m => LP m Tok +amp = symbol '&' + +-- Split a Word into individual Symbols (for parseAligns) +splitWordTok :: PandocMonad m => LP m () +splitWordTok = do + inp <- getInput + case inp of + (Tok spos Word t : rest) -> + setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest + _ -> return () + +parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))] +parseAligns = try $ do + let maybeBar = skipMany + (try $ sp *> (() <$ symbol '|' <|> () <$ (symbol '@' >> braced))) + let cAlign = AlignCenter <$ symbol 'c' + let lAlign = AlignLeft <$ symbol 'l' + let rAlign = AlignRight <$ symbol 'r' + let parAlign = AlignLeft <$ symbol 'p' + -- aligns from tabularx + let xAlign = AlignLeft <$ symbol 'X' + let mAlign = AlignLeft <$ symbol 'm' + let bAlign = AlignLeft <$ symbol 'b' + let alignChar = splitWordTok *> ( cAlign <|> lAlign <|> rAlign <|> parAlign + <|> xAlign <|> mAlign <|> bAlign ) + let alignPrefix = symbol '>' >> braced + let alignSuffix = symbol '<' >> braced + let colWidth = try $ do + symbol '{' + ds <- trim . untokenize <$> manyTill anyTok (controlSeq "linewidth") + spaces + symbol '}' + return $ safeRead ds + let alignSpec = do + pref <- option [] alignPrefix + spaces + al <- alignChar + width <- colWidth <|> option Nothing (do s <- untokenize <$> braced + pos <- getPosition + report $ SkippedContent s pos + return Nothing) + spaces + suff <- option [] alignSuffix + return (al, width, (pref, suff)) + let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro + symbol '*' + spaces + ds <- trim . untokenize <$> braced + spaces + spec <- braced + case safeRead ds of + Just n -> + getInput >>= setInput . (mconcat (replicate n spec) ++) + Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number" + bgroup + spaces + maybeBar + aligns' <- many $ try $ spaces >> optional starAlign >> + (alignSpec <* maybeBar) + spaces + egroup + spaces + return $ map toSpec aligns' + where + toColWidth (Just w) | w > 0 = ColWidth w + toColWidth _ = ColWidthDefault + toSpec (x, y, z) = (x, toColWidth y, z) + +-- N.B. this parser returns a Row that may have erroneous empty cells +-- in it. See the note above fixTableHead for details. +parseTableRow :: PandocMonad m + => LP m Blocks -- ^ block parser + -> LP m Inlines -- ^ inline parser + -> Text -- ^ table environment name + -> [([Tok], [Tok])] -- ^ pref/suffixes + -> LP m Row +parseTableRow blocks inline envname prefsufs = do + notFollowedBy (spaces *> end_ envname) + -- contexts that can contain & that is not colsep: + let canContainAmp (Tok _ (CtrlSeq "begin") _) = True + canContainAmp (Tok _ (CtrlSeq "verb") _) = True + canContainAmp (Tok _ (CtrlSeq "Verb") _) = True + canContainAmp _ = False + -- add prefixes and suffixes in token stream: + let celltoks (pref, suff) = do + prefpos <- getPosition + contents <- mconcat <$> + many ( snd <$> withRaw + ((lookAhead (controlSeq "parbox") >> + void blocks) -- #5711 + <|> + (lookAhead (satisfyTok canContainAmp) >> void inline) + <|> + (lookAhead (symbol '$') >> void inline)) + <|> + (do notFollowedBy + (() <$ amp <|> () <$ lbreak <|> end_ envname) + count 1 anyTok) ) + + suffpos <- getPosition + option [] (count 1 amp) + return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff + rawcells <- mapM celltoks prefsufs + cells <- mapM (parseFromToks (parseTableCell blocks)) rawcells + spaces + return $ Row nullAttr cells + +parseTableCell :: PandocMonad m => LP m Blocks -> LP m Cell +parseTableCell blocks = do + spaces + updateState $ \st -> st{ sInTableCell = True } + cell' <- multicolumnCell blocks + <|> multirowCell blocks + <|> parseSimpleCell + <|> parseEmptyCell + updateState $ \st -> st{ sInTableCell = False } + spaces + return cell' + where + -- The parsing of empty cells is important in LaTeX, especially when dealing + -- with multirow/multicolumn. See #6603. + parseEmptyCell = spaces $> emptyCell + parseSimpleCell = simpleCell <$> (plainify <$> blocks) + + +cellAlignment :: PandocMonad m => LP m Alignment +cellAlignment = skipMany (symbol '|') *> alignment <* skipMany (symbol '|') + where + alignment = do + c <- untoken <$> singleChar + return $ case c of + "l" -> AlignLeft + "r" -> AlignRight + "c" -> AlignCenter + "*" -> AlignDefault + _ -> AlignDefault + +plainify :: Blocks -> Blocks +plainify bs = case toList bs of + [Para ils] -> plain (fromList ils) + _ -> bs + +multirowCell :: PandocMonad m => LP m Blocks -> LP m Cell +multirowCell blocks = controlSeq "multirow" >> do + -- Full prototype for \multirow macro is: + -- \multirow[vpos]{nrows}[bigstruts]{width}[vmove]{text} + -- However, everything except `nrows` and `text` make + -- sense in the context of the Pandoc AST + _ <- optional $ symbol '[' *> cellAlignment <* symbol ']' -- vertical position + nrows <- fmap (fromMaybe 1 . safeRead . untokenize) braced + _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- bigstrut-related + _ <- symbol '{' *> manyTill anyTok (symbol '}') -- Cell width + _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- Length used for fine-tuning + content <- symbol '{' *> (plainify <$> blocks) <* symbol '}' + return $ cell AlignDefault (RowSpan nrows) (ColSpan 1) content + +multicolumnCell :: PandocMonad m => LP m Blocks -> LP m Cell +multicolumnCell blocks = controlSeq "multicolumn" >> do + span' <- fmap (fromMaybe 1 . safeRead . untokenize) braced + alignment <- symbol '{' *> cellAlignment <* symbol '}' + + let singleCell = do + content <- plainify <$> blocks + return $ cell alignment (RowSpan 1) (ColSpan span') content + + -- Two possible contents: either a \multirow cell, or content. + -- E.g. \multicol{1}{c}{\multirow{2}{1em}{content}} + -- Note that a \multirow cell can be nested in a \multicolumn, + -- but not the other way around. See #6603 + let nestedCell = do + (Cell _ _ (RowSpan rs) _ bs) <- multirowCell blocks + return $ cell + alignment + (RowSpan rs) + (ColSpan span') + (fromList bs) + + symbol '{' *> (nestedCell <|> singleCell) <* symbol '}' + +-- LaTeX tables are stored with empty cells underneath multirow cells +-- denoting the grid spaces taken up by them. More specifically, if a +-- cell spans m rows, then it will overwrite all the cells in the +-- columns it spans for (m-1) rows underneath it, requiring padding +-- cells in these places. These padding cells need to be removed for +-- proper table reading. See #6603. +-- +-- These fixTable functions do not otherwise fix up malformed +-- input tables: that is left to the table builder. +fixTableHead :: TableHead -> TableHead +fixTableHead (TableHead attr rows) = TableHead attr rows' + where + rows' = fixTableRows rows + +fixTableBody :: TableBody -> TableBody +fixTableBody (TableBody attr rhc th tb) + = TableBody attr rhc th' tb' + where + th' = fixTableRows th + tb' = fixTableRows tb + +fixTableRows :: [Row] -> [Row] +fixTableRows = fixTableRows' $ repeat Nothing + where + fixTableRows' oldHang (Row attr cells : rs) + = let (newHang, cells') = fixTableRow oldHang cells + rs' = fixTableRows' newHang rs + in Row attr cells' : rs' + fixTableRows' _ [] = [] + +-- The overhang is represented as Just (relative cell dimensions) or +-- Nothing for an empty grid space. +fixTableRow :: [Maybe (ColSpan, RowSpan)] -> [Cell] -> ([Maybe (ColSpan, RowSpan)], [Cell]) +fixTableRow oldHang cells + -- If there's overhang, drop cells until their total width meets the + -- width of the occupied grid spaces (or we run out) + | (n, prefHang, restHang) <- splitHang oldHang + , n > 0 + = let cells' = dropToWidth getCellW n cells + (restHang', cells'') = fixTableRow restHang cells' + in (prefHang restHang', cells'') + -- Otherwise record the overhang of a pending cell and fix the rest + -- of the row + | c@(Cell _ _ h w _):cells' <- cells + = let h' = max 1 h + w' = max 1 w + oldHang' = dropToWidth getHangW w' oldHang + (newHang, cells'') = fixTableRow oldHang' cells' + in (toHang w' h' <> newHang, c : cells'') + | otherwise + = (oldHang, []) + where + getCellW (Cell _ _ _ w _) = w + getHangW = maybe 1 fst + getCS (ColSpan n) = n + + toHang c r + | r > 1 = [Just (c, r)] + | otherwise = replicate (getCS c) Nothing + + -- Take the prefix of the overhang list representing filled grid + -- spaces. Also return the remainder and the length of this prefix. + splitHang = splitHang' 0 id + + splitHang' !n l (Just (c, r):xs) + = splitHang' (n + c) (l . (toHang c (r-1) ++)) xs + splitHang' n l xs = (n, l, xs) + + -- Drop list items until the total width of the dropped items + -- exceeds the passed width. + dropToWidth _ n l | n < 1 = l + dropToWidth wproj n (c:cs) = dropToWidth wproj (n - wproj c) cs + dropToWidth _ _ [] = [] + +simpTable :: PandocMonad m + => LP m Blocks + -> LP m Inlines + -> Text + -> Bool + -> LP m Blocks +simpTable blocks inline envname hasWidthParameter = try $ do + when hasWidthParameter $ () <$ tokWith inline + skipopts + colspecs <- parseAligns + let (aligns, widths, prefsufs) = unzip3 colspecs + optional $ controlSeq "caption" *> setCaption inline + spaces + optional label + spaces + optional lbreak + spaces + skipMany hline + spaces + header' <- option [] . try . fmap (:[]) $ + parseTableRow blocks inline envname prefsufs <* + lbreak <* many1 hline + spaces + rows <- sepEndBy (parseTableRow blocks inline envname prefsufs) + (lbreak <* optional (skipMany hline)) + spaces + optional $ controlSeq "caption" *> setCaption inline + spaces + optional label + spaces + optional lbreak + spaces + lookAhead $ controlSeq "end" -- make sure we're at end + let th = fixTableHead $ TableHead nullAttr header' + let tbs = [fixTableBody $ TableBody nullAttr 0 [] rows] + let tf = TableFoot nullAttr [] + return $ table emptyCaption (zip aligns widths) th tbs tf + +addTableCaption :: PandocMonad m => Blocks -> LP m Blocks +addTableCaption = walkM go + where go (Table attr c spec th tb tf) = do + st <- getState + let mblabel = sLastLabel st + capt <- case (sCaption st, mblabel) of + (Just ils, Nothing) -> return $ caption Nothing (plain ils) + (Just ils, Just lab) -> do + num <- getNextNumber sLastTableNum + setState + st{ sLastTableNum = num + , sLabels = M.insert lab + [Str (renderDottedNum num)] + (sLabels st) } + return $ caption Nothing (plain ils) -- add number?? + (Nothing, _) -> return c + let attr' = case (attr, mblabel) of + ((_,classes,kvs), Just ident) -> + (ident,classes,kvs) + _ -> attr + return $ addAttrDiv attr' $ Table nullAttr capt spec th tb tf + go x = return x + +-- TODO: For now we add a Div to contain table attributes, since +-- most writers don't do anything yet with attributes on Table. +-- This can be removed when that changes. +addAttrDiv :: Attr -> Block -> Block +addAttrDiv ("",[],[]) b = b +addAttrDiv attr b = Div attr [b] diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs index a017a2afb..c20b72bc5 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Types.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs @@ -1,6 +1,7 @@ +{-# LANGUAGE FlexibleInstances #-} {- | Module : Text.Pandoc.Readers.LaTeX.Types - Copyright : Copyright (C) 2017-2020 John MacFarlane + Copyright : Copyright (C) 2017-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -18,7 +19,9 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..) ) where import Data.Text (Text) -import Text.Parsec.Pos (SourcePos) +import Text.Parsec.Pos (SourcePos, sourceName) +import Text.Pandoc.Sources +import Data.List (groupBy) data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | Esc1 | Esc2 | Arg Int @@ -27,6 +30,16 @@ data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | data Tok = Tok SourcePos TokType Text deriving (Eq, Ord, Show) +instance ToSources [Tok] where + toSources = Sources + . map (\ts -> case ts of + Tok p _ _ : _ -> (p, mconcat $ map tokToText ts) + _ -> error "toSources [Tok] encountered empty group") + . groupBy (\(Tok p1 _ _) (Tok p2 _ _) -> sourceName p1 == sourceName p2) + +tokToText :: Tok -> Text +tokToText (Tok _ _ t) = t + data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed deriving (Eq, Ord, Show) diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 21b8feaab..1141af66f 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -20,7 +20,7 @@ import Control.Monad (liftM, mzero, guard, void) import Control.Monad.Trans (lift) import Control.Monad.Except (throwError) import Data.Maybe (catMaybes, isJust) -import Data.List (intersperse, intercalate) +import Data.List (intersperse) import qualified Data.Text as T import Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocMonad (PandocMonad(..), report) @@ -29,9 +29,8 @@ import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Walk (query) -import Text.Pandoc.Shared (crFilter, mapLeft) +import Text.Pandoc.Shared (mapLeft) import Text.Pandoc.Readers.Roff -- TODO explicit imports -import Text.Parsec hiding (tokenPrim) import qualified Text.Parsec as Parsec import Text.Parsec.Pos (updatePosString) import qualified Data.Foldable as Foldable @@ -50,13 +49,20 @@ type ManParser m = ParserT [RoffToken] ManState m -- | Read man (troff) from an input string and return a Pandoc document. -readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc -readMan opts txt = do - tokenz <- lexRoff (initialPos "input") (crFilter txt) +readMan :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc +readMan opts s = do + let Sources inps = toSources s + tokenz <- mconcat <$> mapM (uncurry lexRoff) inps let state = def {readerOptions = opts} :: ManState + let fixError (PandocParsecError _ e) = PandocParsecError (Sources inps) e + fixError e = e eitherdoc <- readWithMTokens parseMan state (Foldable.toList . unRoffTokens $ tokenz) - either throwError return eitherdoc + either (throwError . fixError) return eitherdoc + readWithMTokens :: PandocMonad m => ParserT [RoffToken] ManState m a -- ^ parser @@ -64,9 +70,10 @@ readWithMTokens :: PandocMonad m -> [RoffToken] -- ^ input -> m (Either PandocError a) readWithMTokens parser state input = - let leftF = PandocParsecError . T.pack . intercalate "\n" $ show <$> input + let leftF = PandocParsecError mempty in mapLeft leftF `liftM` runParserT parser state "source" input + parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do bs <- many parseBlock <* eof @@ -89,7 +96,7 @@ parseBlock = choice [ parseList parseTable :: PandocMonad m => ManParser m Blocks parseTable = do - modifyState $ \st -> st { tableCellsPlain = True } + updateState $ \st -> st { tableCellsPlain = True } let isTbl Tbl{} = True isTbl _ = False Tbl _opts rows pos <- msatisfy isTbl @@ -135,7 +142,7 @@ parseTable = do case res' of Left _ -> Prelude.fail "Could not parse table cell" Right x -> do - modifyState $ \s -> s{ tableCellsPlain = False } + updateState $ \s -> s{ tableCellsPlain = False } return x Right x -> return x @@ -222,7 +229,7 @@ parseTitle = do setMeta "section" (linePartsToInlines y) [x] -> setMeta "title" (linePartsToInlines x) [] -> id - modifyState $ \st -> st{ metadata = adjustMeta $ metadata st } + updateState $ \st -> st{ metadata = adjustMeta $ metadata st } return mempty linePartsToInlines :: [LinePart] -> Inlines diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 5888bf095..2dc7ddf52 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -21,15 +21,17 @@ module Text.Pandoc.Readers.Markdown ( import Control.Monad import Control.Monad.Except (throwError) import Data.Char (isAlphaNum, isPunctuation, isSpace) -import Data.List (transpose, elemIndex, sortOn) +import Data.List (transpose, elemIndex, sortOn, foldl') +import Data.List.NonEmpty (NonEmpty(..)) import qualified Data.Map as M import Data.Maybe import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T -import qualified Data.Text.Lazy as TL import qualified Data.ByteString.Lazy as BL -import System.FilePath (addExtension, takeExtension) +import System.FilePath (addExtension, takeExtension, takeDirectory) +import qualified System.FilePath.Windows as Windows +import qualified System.FilePath.Posix as Posix import Text.HTML.TagSoup hiding (Row) import Text.Pandoc.Builder (Blocks, Inlines) import qualified Text.Pandoc.Builder as B @@ -45,20 +47,22 @@ import Text.Pandoc.Readers.HTML (htmlInBalanced, htmlTag, isBlockTag, isCommentTag, isInlineTag, isTextTag) import Text.Pandoc.Readers.LaTeX (applyMacros, rawLaTeXBlock, rawLaTeXInline) import Text.Pandoc.Shared -import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.XML (fromEntities) -import Text.Pandoc.Readers.Metadata (yamlBsToMeta, yamlBsToRefs) +import Text.Pandoc.Readers.Metadata (yamlBsToMeta, yamlBsToRefs, yamlMetaBlock) +-- import Debug.Trace (traceShowId) -type MarkdownParser m = ParserT Text ParserState m +type MarkdownParser m = ParserT Sources ParserState m + +type F = Future ParserState -- | Read markdown from an input string and return a Pandoc document. -readMarkdown :: PandocMonad m +readMarkdown :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -- ^ Input -> m Pandoc readMarkdown opts s = do parsed <- readWithM parseMarkdown def{ stateOptions = opts } - (crFilter s <> "\n\n") + (ensureFinalNewlines 3 (toSources s)) case parsed of Right result -> return result Left e -> throwError e @@ -79,7 +83,7 @@ yamlToMeta opts mbfp bstr = do meta <- yamlBsToMeta (fmap B.toMetaValue <$> parseBlocks) bstr setPosition oldPos return $ runF meta defaultParserState - parsed <- readWithM parser def{ stateOptions = opts } "" + parsed <- readWithM parser def{ stateOptions = opts } ("" :: Text) case parsed of Right result -> return result Left e -> throwError e @@ -95,14 +99,12 @@ yamlToRefs :: PandocMonad m -> m [MetaValue] yamlToRefs idpred opts mbfp bstr = do let parser = do - oldPos <- getPosition case mbfp of Nothing -> return () Just fp -> setPosition $ initialPos fp refs <- yamlBsToRefs (fmap B.toMetaValue <$> parseBlocks) idpred bstr - setPosition oldPos return $ runF refs defaultParserState - parsed <- readWithM parser def{ stateOptions = opts } "" + parsed <- readWithM parser def{ stateOptions = opts } ("" :: Text) case parsed of Right result -> return result Left e -> throwError e @@ -145,14 +147,14 @@ inList = do ctx <- stateParserContext <$> getState guard (ctx == ListItemState) -spnl :: PandocMonad m => ParserT Text st m () +spnl :: PandocMonad m => ParserT Sources st m () spnl = try $ do skipSpaces optional newline skipSpaces notFollowedBy (char '\n') -spnl' :: PandocMonad m => ParserT Text st m Text +spnl' :: PandocMonad m => ParserT Sources st m Text spnl' = try $ do xs <- many spaceChar ys <- option "" $ try $ (:) <$> newline @@ -247,51 +249,48 @@ titleBlock :: PandocMonad m => MarkdownParser m () titleBlock = pandocTitleBlock <|> mmdTitleBlock pandocTitleBlock :: PandocMonad m => MarkdownParser m () -pandocTitleBlock = try $ do +pandocTitleBlock = do guardEnabled Ext_pandoc_title_block lookAhead (char '%') - title <- option mempty titleLine - author <- option (return []) authorsLine - date <- option mempty dateLine - optional blanklines - let meta' = do title' <- title - author' <- author - date' <- date - return $ - (if null title' then id else B.setMeta "title" title') - . (if null author' then id else B.setMeta "author" author') - . (if null date' then id else B.setMeta "date" date') - $ nullMeta - updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } - -yamlMetaBlock :: PandocMonad m => MarkdownParser m (F Blocks) -yamlMetaBlock = try $ do + try $ do + title <- option mempty titleLine + author <- option (return []) authorsLine + date <- option mempty dateLine + optional blanklines + let meta' = do title' <- title + author' <- author + date' <- date + return $ + (if null title' + then id + else B.setMeta "title" title') + . (if null author' + then id + else B.setMeta "author" author') + . (if null date' + then id + else B.setMeta "date" date') + $ nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + +yamlMetaBlock' :: PandocMonad m => MarkdownParser m (F Blocks) +yamlMetaBlock' = do guardEnabled Ext_yaml_metadata_block - string "---" - blankline - notFollowedBy blankline -- if --- is followed by a blank it's an HRULE - rawYamlLines <- manyTill anyLine stopLine - -- by including --- and ..., we allow yaml blocks with just comments: - let rawYaml = T.unlines ("---" : (rawYamlLines ++ ["..."])) - optional blanklines - newMetaF <- yamlBsToMeta (fmap B.toMetaValue <$> parseBlocks) - $ UTF8.fromTextLazy $ TL.fromStrict rawYaml + newMetaF <- yamlMetaBlock (fmap B.toMetaValue <$> parseBlocks) -- Since `<>` is left-biased, existing values are not touched: updateState $ \st -> st{ stateMeta' = stateMeta' st <> newMetaF } return mempty -stopLine :: PandocMonad m => MarkdownParser m () -stopLine = try $ (string "---" <|> string "...") >> blankline >> return () - mmdTitleBlock :: PandocMonad m => MarkdownParser m () -mmdTitleBlock = try $ do +mmdTitleBlock = do guardEnabled Ext_mmd_title_block - firstPair <- kvPair False - restPairs <- many (kvPair True) - let kvPairs = firstPair : restPairs - blanklines - updateState $ \st -> st{ stateMeta' = stateMeta' st <> - return (Meta $ M.fromList kvPairs) } + try $ do + firstPair <- kvPair False + restPairs <- many (kvPair True) + let kvPairs = firstPair : restPairs + blanklines + updateState $ \st -> st{ stateMeta' = stateMeta' st <> + return (Meta $ M.fromList kvPairs) } kvPair :: PandocMonad m => Bool -> MarkdownParser m (Text, MetaValue) kvPair allowEmpty = try $ do @@ -300,7 +299,7 @@ kvPair allowEmpty = try $ do (try $ newline >> lookAhead (blankline <|> nonspaceChar)) guard $ allowEmpty || not (T.null val) let key' = T.concat $ T.words $ T.toLower key - let val' = MetaBlocks $ B.toList $ B.plain $ B.text val + let val' = MetaInlines $ B.toList $ B.text val return (key',val') parseMarkdown :: PandocMonad m => MarkdownParser m Pandoc @@ -334,10 +333,14 @@ referenceKey = try $ do skipMany spaceChar notFollowedBy' referenceTitle notFollowedBy' $ guardEnabled Ext_link_attributes >> attributes + notFollowedBy' $ guardEnabled Ext_mmd_link_attributes >> + try (spnl <* keyValAttr) notFollowedBy' (() <$ reference) many1Char $ notFollowedBy space >> litChar let betweenAngles = try $ char '<' >> manyTillChar litChar (char '>') - src <- try betweenAngles <|> sourceURL + rebase <- option False (True <$ guardEnabled Ext_rebase_relative_paths) + src <- (if rebase then rebasePath pos else id) <$> + (try betweenAngles <|> sourceURL) tit <- option "" referenceTitle attr <- option nullAttr $ try $ do guardEnabled Ext_link_attributes @@ -346,7 +349,7 @@ referenceKey = try $ do addKvs <- option [] $ guardEnabled Ext_mmd_link_attributes >> many (try $ spnl >> keyValAttr) blanklines - let attr' = extractIdClass $ foldl (\x f -> f x) attr addKvs + let attr' = extractIdClass $ foldl' (\x f -> f x) attr addKvs target = (escapeURI $ trimr src, tit) st <- getState let oldkeys = stateKeys st @@ -442,7 +445,7 @@ block :: PandocMonad m => MarkdownParser m (F Blocks) block = do res <- choice [ mempty <$ blanklines , codeBlockFenced - , yamlMetaBlock + , yamlMetaBlock' -- note: bulletList needs to be before header because of -- the possibility of empty list items: - , bulletList @@ -568,7 +571,7 @@ registerImplicitHeader raw attr@(ident, _, _) -- hrule block -- -hrule :: PandocMonad m => ParserT Text st m (F Blocks) +hrule :: PandocMonad m => ParserT Sources st m (F Blocks) hrule = try $ do skipSpaces start <- satisfy isHruleChar @@ -588,7 +591,7 @@ indentedLine = indentSpaces >> anyLineNewline blockDelimiter :: PandocMonad m => (Char -> Bool) -> Maybe Int - -> ParserT Text ParserState m Int + -> ParserT Sources ParserState m Int blockDelimiter f len = try $ do skipNonindentSpaces c <- lookAhead (satisfy f) @@ -602,7 +605,7 @@ attributes = try $ do spnl attrs <- many (attribute <* spnl) char '}' - return $ foldl (\x f -> f x) nullAttr attrs + return $ foldl' (\x f -> f x) nullAttr attrs attribute :: PandocMonad m => MarkdownParser m (Attr -> Attr) attribute = identifierAttr <|> classAttr <|> keyValAttr <|> specialAttr @@ -659,15 +662,15 @@ codeBlockFenced :: PandocMonad m => MarkdownParser m (F Blocks) codeBlockFenced = try $ do indentchars <- nonindentSpaces let indentLevel = T.length indentchars - c <- try (guardEnabled Ext_fenced_code_blocks >> lookAhead (char '~')) + c <- (guardEnabled Ext_fenced_code_blocks >> lookAhead (char '~')) <|> (guardEnabled Ext_backtick_code_blocks >> lookAhead (char '`')) size <- blockDelimiter (== c) Nothing skipMany spaceChar rawattr <- - (Left <$> try (guardEnabled Ext_raw_attribute >> rawAttribute)) + (Left <$> (guardEnabled Ext_raw_attribute >> try rawAttribute)) <|> (Right <$> option ("",[],[]) - (try (guardEnabled Ext_fenced_code_attributes >> attributes) + ((guardEnabled Ext_fenced_code_attributes >> try attributes) <|> ((\x -> ("",[toLanguageId x],[])) <$> many1Char nonspaceChar))) blankline contents <- T.intercalate "\n" <$> @@ -732,7 +735,7 @@ lhsCodeBlockBirdWith c = try $ do blanklines return $ T.intercalate "\n" lns' -birdTrackLine :: PandocMonad m => Char -> ParserT Text st m Text +birdTrackLine :: PandocMonad m => Char -> ParserT Sources st m Text birdTrackLine c = try $ do char c -- allow html tags on left margin: @@ -1025,7 +1028,7 @@ para = try $ do option (B.plain <$> result) $ try $ do newline - (blanklines >> return mempty) + (mempty <$ blanklines) <|> (guardDisabled Ext_blank_before_blockquote >> () <$ lookAhead blockQuote) <|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced) <|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header) @@ -1118,6 +1121,7 @@ rawTeXBlock = do rawHtmlBlocks :: PandocMonad m => MarkdownParser m (F Blocks) rawHtmlBlocks = do (TagOpen tagtype _, raw) <- htmlTag isBlockTag + let selfClosing = "/>" `T.isSuffixOf` raw -- we don't want '<td> text' to be a code block: skipMany spaceChar indentlevel <- (blankline >> length <$> many (char ' ')) <|> return 0 @@ -1131,7 +1135,9 @@ rawHtmlBlocks = do gobbleAtMostSpaces indentlevel notFollowedBy' closer block - contents <- mconcat <$> many block' + contents <- if selfClosing + then return mempty + else mconcat <$> many block' result <- try (do gobbleAtMostSpaces indentlevel @@ -1155,11 +1161,12 @@ stripMarkdownAttribute s = renderTags' $ map filterAttrib $ parseTags s -- lineBlock :: PandocMonad m => MarkdownParser m (F Blocks) -lineBlock = try $ do +lineBlock = do guardEnabled Ext_line_blocks - lines' <- lineBlockLines >>= - mapM (parseFromString' (trimInlinesF <$> inlines)) - return $ B.lineBlock <$> sequence lines' + try $ do + lines' <- lineBlockLines >>= + mapM (parseFromString' (trimInlinesF <$> inlines)) + return $ B.lineBlock <$> sequence lines' -- -- Tables @@ -1169,7 +1176,7 @@ lineBlock = try $ do -- and the length including trailing space. dashedLine :: PandocMonad m => Char - -> ParserT Text st m (Int, Int) + -> ParserT Sources st m (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many spaceChar @@ -1238,7 +1245,7 @@ rawTableLine :: PandocMonad m -> MarkdownParser m [Text] rawTableLine indices = do notFollowedBy' (blanklines' <|> tableFooter) - line <- take1WhileP (/='\n') <* newline + line <- anyLine return $ map trim $ tail $ splitTextByIndices (init indices) line @@ -1261,11 +1268,12 @@ multilineRow indices = do -- Parses a table caption: inlines beginning with 'Table:' -- and followed by blank lines. tableCaption :: PandocMonad m => MarkdownParser m (F Inlines) -tableCaption = try $ do +tableCaption = do guardEnabled Ext_table_captions - skipNonindentSpaces - (string ":" <* notFollowedBy (satisfy isPunctuation)) <|> string "Table:" - trimInlinesF <$> inlines1 <* blanklines + try $ do + skipNonindentSpaces + (string ":" <* notFollowedBy (satisfy isPunctuation)) <|> string "Table:" + trimInlinesF <$> inlines1 <* blanklines -- Parse a simple table with '---' header and one line per row. simpleTable :: PandocMonad m @@ -1351,7 +1359,7 @@ pipeTable = try $ do lines' <- many pipeTableRow let lines'' = map (take (length aligns) <$>) lines' let maxlength = maximum $ - map (\x -> T.length . stringify $ runF x def) (heads' : lines'') + fmap (\x -> T.length . stringify $ runF x def) (heads' :| lines'') numColumns <- getOption readerColumns let widths = if maxlength > numColumns then map (\len -> @@ -1388,7 +1396,7 @@ pipeTableCell = return $ B.plain <$> result) <|> return mempty -pipeTableHeaderPart :: PandocMonad m => ParserT Text st m (Alignment, Int) +pipeTableHeaderPart :: PandocMonad m => ParserT Sources st m (Alignment, Int) pipeTableHeaderPart = try $ do skipMany spaceChar left <- optionMaybe (char ':') @@ -1404,10 +1412,14 @@ pipeTableHeaderPart = try $ do (Just _,Just _) -> AlignCenter, len) -- Succeed only if current line contains a pipe. -scanForPipe :: PandocMonad m => ParserT Text st m () +scanForPipe :: PandocMonad m => ParserT Sources st m () scanForPipe = do - inp <- getInput - case T.break (\c -> c == '\n' || c == '|') inp of + Sources inps <- getInput + let ln = case inps of + [] -> "" + ((_,t):(_,t'):_) | T.null t -> t' + ((_,t):_) -> t + case T.break (\c -> c == '\n' || c == '|') ln of (_, T.uncons -> Just ('|', _)) -> return () _ -> mzero @@ -1434,15 +1446,14 @@ table :: PandocMonad m => MarkdownParser m (F Blocks) table = try $ do frontCaption <- option Nothing (Just <$> tableCaption) (aligns, widths, heads, lns) <- - try (guardEnabled Ext_pipe_tables >> scanForPipe >> pipeTable) <|> - try (guardEnabled Ext_multiline_tables >> - multilineTable False) <|> - try (guardEnabled Ext_simple_tables >> - (simpleTable True <|> simpleTable False)) <|> - try (guardEnabled Ext_multiline_tables >> - multilineTable True) <|> - try (guardEnabled Ext_grid_tables >> - (gridTable False <|> gridTable True)) <?> "table" + (guardEnabled Ext_pipe_tables >> try (scanForPipe >> pipeTable)) <|> + (guardEnabled Ext_multiline_tables >> try (multilineTable False)) <|> + (guardEnabled Ext_simple_tables >> + try (simpleTable True <|> simpleTable False)) <|> + (guardEnabled Ext_multiline_tables >> + try (multilineTable True)) <|> + (guardEnabled Ext_grid_tables >> + try (gridTable False <|> gridTable True)) <?> "table" optional blanklines caption <- case frontCaption of Nothing -> option (return mempty) tableCaption @@ -1476,35 +1487,37 @@ inlines1 :: PandocMonad m => MarkdownParser m (F Inlines) inlines1 = mconcat <$> many1 inline inline :: PandocMonad m => MarkdownParser m (F Inlines) -inline = choice [ whitespace - , bareURL - , str - , endline - , code - , strongOrEmph - , note - , cite - , bracketedSpan - , link - , image - , math - , strikeout - , subscript - , superscript - , inlineNote -- after superscript because of ^[link](/foo)^ - , autoLink - , spanHtml - , rawHtmlInline - , escapedNewline - , escapedChar - , rawLaTeXInline' - , exampleRef - , smart - , return . B.singleton <$> charRef - , emoji - , symbol - , ltSign - ] <?> "inline" +inline = do + c <- lookAhead anyChar + ((case c of + ' ' -> whitespace + '\t' -> whitespace + '\n' -> endline + '`' -> code + '_' -> strongOrEmph + '*' -> strongOrEmph + '^' -> superscript <|> inlineNote -- in this order bc ^[link](/foo)^ + '[' -> note <|> cite <|> bracketedSpan <|> link + '!' -> image + '$' -> math + '~' -> strikeout <|> subscript + '<' -> autoLink <|> spanHtml <|> rawHtmlInline <|> ltSign + '\\' -> math <|> escapedNewline <|> escapedChar <|> rawLaTeXInline' + '@' -> cite <|> exampleRef + '"' -> smart + '\'' -> smart + '\8216' -> smart + '\145' -> smart + '\8220' -> smart + '\147' -> smart + '-' -> smart + '.' -> smart + '&' -> return . B.singleton <$> charRef + ':' -> emoji + _ -> mzero) + <|> bareURL + <|> str + <|> symbol) <?> "inline" escapedChar' :: PandocMonad m => MarkdownParser m Char escapedChar' = try $ do @@ -1515,11 +1528,12 @@ escapedChar' = try $ do <|> oneOf "\\`*_{}[]()>#+-.!~\"" escapedNewline :: PandocMonad m => MarkdownParser m (F Inlines) -escapedNewline = try $ do +escapedNewline = do guardEnabled Ext_escaped_line_breaks - char '\\' - lookAhead (char '\n') -- don't consume the newline (see #3730) - return $ return B.linebreak + try $ do + char '\\' + lookAhead (char '\n') -- don't consume the newline (see #3730) + return $ return B.linebreak escapedChar :: PandocMonad m => MarkdownParser m (F Inlines) escapedChar = do @@ -1541,19 +1555,20 @@ ltSign = do -- whole document has been parsed. But we need this parser -- here in case citations is disabled. exampleRef :: PandocMonad m => MarkdownParser m (F Inlines) -exampleRef = try $ do +exampleRef = do guardEnabled Ext_example_lists - char '@' - lab <- mconcat . map T.pack <$> - many (many1 alphaNum <|> - try (do c <- char '_' <|> char '-' - cs <- many1 alphaNum - return (c:cs))) - return $ do - st <- askF - return $ case M.lookup lab (stateExamples st) of - Just n -> B.str $ tshow n - Nothing -> B.str $ "@" <> lab + try $ do + char '@' + lab <- mconcat . map T.pack <$> + many (many1 alphaNum <|> + try (do c <- char '_' <|> char '-' + cs <- many1 alphaNum + return (c:cs))) + return $ do + st <- askF + return $ case M.lookup lab (stateExamples st) of + Just n -> B.str $ tshow n + Nothing -> B.str $ "@" <> lab symbol :: PandocMonad m => MarkdownParser m (F Inlines) symbol = do @@ -1580,10 +1595,10 @@ code = try $ do >> count (length starts) (char '`') >> notFollowedBy (char '`')) rawattr <- - (Left <$> try (guardEnabled Ext_raw_attribute >> rawAttribute)) + (Left <$> (guardEnabled Ext_raw_attribute >> try rawAttribute)) <|> (Right <$> option ("",[],[]) - (try (guardEnabled Ext_inline_code_attributes >> attributes))) + (guardEnabled Ext_inline_code_attributes >> try attributes)) return $ return $ case rawattr of Left syn -> B.rawInline syn result @@ -1676,38 +1691,40 @@ strikeout = fmap B.strikeout <$> strikeEnd = try $ string "~~" superscript :: PandocMonad m => MarkdownParser m (F Inlines) -superscript = fmap B.superscript <$> try (do +superscript = do guardEnabled Ext_superscript - char '^' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '^')) + fmap B.superscript <$> try (do + char '^' + mconcat <$> many1Till (do notFollowedBy spaceChar + notFollowedBy newline + inline) (char '^')) subscript :: PandocMonad m => MarkdownParser m (F Inlines) -subscript = fmap B.subscript <$> try (do +subscript = do guardEnabled Ext_subscript - char '~' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '~')) + fmap B.subscript <$> try (do + char '~' + mconcat <$> many1Till (do notFollowedBy spaceChar + notFollowedBy newline + inline) (char '~')) whitespace :: PandocMonad m => MarkdownParser m (F Inlines) whitespace = spaceChar >> return <$> (lb <|> regsp) <?> "whitespace" where lb = spaceChar >> skipMany spaceChar >> option B.space (endline >> return B.linebreak) regsp = skipMany spaceChar >> return B.space -nonEndline :: PandocMonad m => ParserT Text st m Char +nonEndline :: PandocMonad m => ParserT Sources st m Char nonEndline = satisfy (/='\n') str :: PandocMonad m => MarkdownParser m (F Inlines) str = do result <- mconcat <$> many1 - ( take1WhileP isAlphaNum + ( T.pack <$> (many1 alphaNum) <|> "." <$ try (char '.' <* notFollowedBy (char '.')) ) updateLastStrPos (do guardEnabled Ext_smart abbrevs <- getOption readerAbbreviations - if not (T.null result) && T.last result == '.' && result `Set.member` abbrevs + if result `Set.member` abbrevs then try (do ils <- whitespace notFollowedBy (() <$ cite <|> () <$ note) -- ?? lookAhead alphaNum @@ -1790,15 +1807,16 @@ link = try $ do regLink B.linkWith lab <|> referenceLink B.linkWith (lab,raw) bracketedSpan :: PandocMonad m => MarkdownParser m (F Inlines) -bracketedSpan = try $ do +bracketedSpan = do guardEnabled Ext_bracketed_spans - (lab,_) <- reference - attr <- attributes - return $ if isSmallCaps attr - then B.smallcaps <$> lab - else if isUnderline attr - then B.underline <$> lab - else B.spanWith attr <$> lab + try $ do + (lab,_) <- reference + attr <- attributes + return $ if isSmallCaps attr + then B.smallcaps <$> lab + else if isUnderline attr + then B.underline <$> lab + else B.spanWith attr <$> lab -- | We treat a span as SmallCaps if class is "smallcaps" (and -- no other attributes are set or if style is "font-variant:small-caps" @@ -1825,9 +1843,12 @@ regLink :: PandocMonad m -> MarkdownParser m (F Inlines) regLink constructor lab = try $ do (src, tit) <- source + rebase <- option False (True <$ guardEnabled Ext_rebase_relative_paths) + pos <- getPosition + let src' = if rebase then rebasePath pos src else src attr <- option nullAttr $ guardEnabled Ext_link_attributes >> attributes - return $ constructor attr src tit <$> lab + return $ constructor attr src' tit <$> lab -- a link like [this][ref] or [this][] or [this] referenceLink :: PandocMonad m @@ -1867,7 +1888,8 @@ referenceLink constructor (lab, raw) = do Just ((src, tit), _) -> constructor nullAttr src tit <$> lab Nothing -> makeFallback else makeFallback - Just ((src,tit), attr) -> constructor attr src tit <$> lab + Just ((src,tit), attr) -> + constructor attr src tit <$> lab dropBrackets :: Text -> Text dropBrackets = dropRB . dropLB @@ -1877,12 +1899,13 @@ dropBrackets = dropRB . dropLB dropLB xs = xs bareURL :: PandocMonad m => MarkdownParser m (F Inlines) -bareURL = try $ do +bareURL = do guardEnabled Ext_autolink_bare_uris getState >>= guard . stateAllowLinks - (cls, (orig, src)) <- (("uri",) <$> uri) <|> (("email",) <$> emailAddress) - notFollowedBy $ try $ spaces >> htmlTag (~== TagClose ("a" :: Text)) - return $ return $ B.linkWith ("",[cls],[]) src "" (B.str orig) + try $ do + (cls, (orig, src)) <- (("uri",) <$> uri) <|> (("email",) <$> emailAddress) + notFollowedBy $ try $ spaces >> htmlTag (~== TagClose ("a" :: Text)) + return $ return $ B.linkWith ("",[cls],[]) src "" (B.str orig) autoLink :: PandocMonad m => MarkdownParser m (F Inlines) autoLink = try $ do @@ -1899,15 +1922,33 @@ autoLink = try $ do return $ return $ B.linkWith attr (src <> escapeURI extra) "" (B.str $ orig <> extra) +-- | Rebase a relative path, by adding the (relative) directory +-- of the containing source position. Absolute links and URLs +-- are untouched. +rebasePath :: SourcePos -> Text -> Text +rebasePath pos path = do + let fp = sourceName pos + isFragment = T.take 1 path == "#" + path' = T.unpack path + isAbsolutePath = Posix.isAbsolute path' || Windows.isAbsolute path' + in if T.null path || isFragment || isAbsolutePath || isURI path + then path + else + case takeDirectory fp of + "" -> path + "." -> path + d -> T.pack d <> "/" <> path + image :: PandocMonad m => MarkdownParser m (F Inlines) image = try $ do char '!' (lab,raw) <- reference defaultExt <- getOption readerDefaultImageExtension - let constructor attr' src = case takeExtension (T.unpack src) of - "" -> B.imageWith attr' (T.pack $ addExtension (T.unpack src) - $ T.unpack defaultExt) - _ -> B.imageWith attr' src + let constructor attr' src = + case takeExtension (T.unpack src) of + "" -> B.imageWith attr' (T.pack $ addExtension (T.unpack src) + $ T.unpack defaultExt) + _ -> B.imageWith attr' src regLink constructor lab <|> referenceLink constructor (lab,raw) note :: PandocMonad m => MarkdownParser m (F Inlines) @@ -1935,23 +1976,25 @@ note = try $ do return $ B.note $ walk adjustCite contents' inlineNote :: PandocMonad m => MarkdownParser m (F Inlines) -inlineNote = try $ do +inlineNote = do guardEnabled Ext_inline_notes - char '^' - updateState $ \st -> st{ stateInNote = True - , stateNoteNumber = stateNoteNumber st + 1 } - contents <- inlinesInBalancedBrackets - updateState $ \st -> st{ stateInNote = False } - return $ B.note . B.para <$> contents + try $ do + char '^' + updateState $ \st -> st{ stateInNote = True + , stateNoteNumber = stateNoteNumber st + 1 } + contents <- inlinesInBalancedBrackets + updateState $ \st -> st{ stateInNote = False } + return $ B.note . B.para <$> contents rawLaTeXInline' :: PandocMonad m => MarkdownParser m (F Inlines) -rawLaTeXInline' = try $ do +rawLaTeXInline' = do guardEnabled Ext_raw_tex notFollowedBy' rawConTeXtEnvironment - s <- rawLaTeXInline - return $ return $ B.rawInline "tex" s -- "tex" because it might be context + try $ do + s <- rawLaTeXInline + return $ return $ B.rawInline "tex" s -- "tex" because it might be context -rawConTeXtEnvironment :: PandocMonad m => ParserT Text st m Text +rawConTeXtEnvironment :: PandocMonad m => ParserT Sources st m Text rawConTeXtEnvironment = try $ do string "\\start" completion <- inBrackets (letter <|> digit <|> spaceChar) @@ -1960,7 +2003,7 @@ rawConTeXtEnvironment = try $ do (try $ string "\\stop" >> textStr completion) return $ "\\start" <> completion <> T.concat contents <> "\\stop" <> completion -inBrackets :: PandocMonad m => ParserT Text st m Char -> ParserT Text st m Text +inBrackets :: PandocMonad m => ParserT Sources st m Char -> ParserT Sources st m Text inBrackets parser = do char '[' contents <- manyChar parser @@ -1968,55 +2011,60 @@ inBrackets parser = do return $ "[" <> contents <> "]" spanHtml :: PandocMonad m => MarkdownParser m (F Inlines) -spanHtml = try $ do +spanHtml = do guardEnabled Ext_native_spans - (TagOpen _ attrs, _) <- htmlTag (~== TagOpen ("span" :: Text) []) - contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose ("span" :: Text))) - let ident = fromMaybe "" $ lookup "id" attrs - let classes = maybe [] T.words $ lookup "class" attrs - let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - return $ if isSmallCaps (ident, classes, keyvals) - then B.smallcaps <$> contents - else if isUnderline (ident, classes, keyvals) - then B.underline <$> contents - else B.spanWith (ident, classes, keyvals) <$> contents + try $ do + (TagOpen _ attrs, _) <- htmlTag (~== TagOpen ("span" :: Text) []) + contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose ("span" :: Text))) + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] T.words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ if isSmallCaps (ident, classes, keyvals) + then B.smallcaps <$> contents + else if isUnderline (ident, classes, keyvals) + then B.underline <$> contents + else B.spanWith (ident, classes, keyvals) <$> contents divHtml :: PandocMonad m => MarkdownParser m (F Blocks) -divHtml = try $ do +divHtml = do guardEnabled Ext_native_divs - (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen ("div" :: Text) []) - -- we set stateInHtmlBlock so that closing tags that can be either block or - -- inline will not be parsed as inline tags - oldInHtmlBlock <- stateInHtmlBlock <$> getState - updateState $ \st -> st{ stateInHtmlBlock = Just "div" } - bls <- option "" (blankline >> option "" blanklines) - contents <- mconcat <$> - many (notFollowedBy' (htmlTag (~== TagClose ("div" :: Text))) >> block) - closed <- option False (True <$ htmlTag (~== TagClose ("div" :: Text))) - if closed - then do - updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } - let ident = fromMaybe "" $ lookup "id" attrs - let classes = maybe [] T.words $ lookup "class" attrs - let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - return $ B.divWith (ident, classes, keyvals) <$> contents - else -- avoid backtracing - return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents + try $ do + (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen ("div" :: Text) []) + -- we set stateInHtmlBlock so that closing tags that can be either block + -- or inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just "div" } + bls <- option "" (blankline >> option "" blanklines) + contents <- mconcat <$> + many (notFollowedBy' (htmlTag (~== TagClose ("div" :: Text))) >> block) + closed <- option False (True <$ htmlTag (~== TagClose ("div" :: Text))) + if closed + then do + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] T.words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.divWith (ident, classes, keyvals) <$> contents + else -- avoid backtracing + return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents divFenced :: PandocMonad m => MarkdownParser m (F Blocks) -divFenced = try $ do +divFenced = do guardEnabled Ext_fenced_divs - string ":::" - skipMany (char ':') - skipMany spaceChar - attribs <- attributes <|> ((\x -> ("",[x],[])) <$> many1Char nonspaceChar) - skipMany spaceChar - skipMany (char ':') - blankline - updateState $ \st -> st{ stateFencedDivLevel = stateFencedDivLevel st + 1 } - bs <- mconcat <$> manyTill block divFenceEnd - updateState $ \st -> st{ stateFencedDivLevel = stateFencedDivLevel st - 1 } - return $ B.divWith attribs <$> bs + try $ do + string ":::" + skipMany (char ':') + skipMany spaceChar + attribs <- attributes <|> ((\x -> ("",[x],[])) <$> many1Char nonspaceChar) + skipMany spaceChar + skipMany (char ':') + blankline + updateState $ \st -> + st{ stateFencedDivLevel = stateFencedDivLevel st + 1 } + bs <- mconcat <$> manyTill block divFenceEnd + updateState $ \st -> + st{ stateFencedDivLevel = stateFencedDivLevel st - 1 } + return $ B.divWith attribs <$> bs divFenceEnd :: PandocMonad m => MarkdownParser m () divFenceEnd = try $ do @@ -2048,14 +2096,15 @@ emojiChars :: [Char] emojiChars = ['a'..'z'] ++ ['0'..'9'] ++ ['_','+','-'] emoji :: PandocMonad m => MarkdownParser m (F Inlines) -emoji = try $ do +emoji = do guardEnabled Ext_emoji - char ':' - emojikey <- many1Char (oneOf emojiChars) - char ':' - case emojiToInline emojikey of - Just i -> return (return $ B.singleton i) - Nothing -> mzero + try $ do + char ':' + emojikey <- many1Char (oneOf emojiChars) + char ':' + case emojiToInline emojikey of + Just i -> return (return $ B.singleton i) + Nothing -> mzero -- Citations @@ -2074,7 +2123,7 @@ cite = do textualCite :: PandocMonad m => MarkdownParser m (F Inlines) textualCite = try $ do - (suppressAuthor, key) <- citeKey + (suppressAuthor, key) <- citeKey True -- If this is a reference to an earlier example list item, -- then don't parse it as a citation. If the example list -- item comes later, we'll parse it here and figure out in @@ -2154,7 +2203,7 @@ prefix = trimInlinesF . mconcat <$> manyTill inline (char ']' <|> lookAhead (try $ do optional (try (char ';' >> spnl)) - citeKey + citeKey True return ']')) citeList :: PandocMonad m => MarkdownParser m (F [Citation]) @@ -2163,7 +2212,7 @@ citeList = fmap sequence $ sepBy1 citation (try $ char ';' >> spnl) citation :: PandocMonad m => MarkdownParser m (F Citation) citation = try $ do pref <- prefix - (suppress_author, key) <- citeKey + (suppress_author, key) <- citeKey True suff <- suffix noteNum <- stateNoteNumber <$> getState return $ do @@ -2182,28 +2231,30 @@ citation = try $ do smart :: PandocMonad m => MarkdownParser m (F Inlines) smart = do guardEnabled Ext_smart - doubleQuoted <|> singleQuoted <|> - choice (map (return <$>) [apostrophe, dash, ellipses]) + doubleQuoted <|> singleQuoted <|> (return <$> doubleCloseQuote) <|> + (return <$> apostrophe) <|> (return <$> dash) <|> (return <$> ellipses) singleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) -singleQuoted = try $ do +singleQuoted = do singleQuoteStart - withQuoteContext InSingleQuote $ + (try (withQuoteContext InSingleQuote $ fmap B.singleQuoted . trimInlinesF . mconcat <$> - many1Till inline singleQuoteEnd + many1Till inline singleQuoteEnd)) + <|> (return (return (B.str "\8217"))) -- doubleQuoted will handle regular double-quoted sections, as well -- as dialogues with an open double-quote without a close double-quote -- in the same paragraph. doubleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) -doubleQuoted = try $ do +doubleQuoted = do doubleQuoteStart - withQuoteContext InDoubleQuote $ + (try (withQuoteContext InDoubleQuote $ fmap B.doubleQuoted . trimInlinesF . mconcat <$> - many1Till inline doubleQuoteEnd + many1Till inline doubleQuoteEnd)) + <|> (return (return (B.str "\8220"))) toRow :: [Blocks] -> Row toRow = Row nullAttr . map B.simpleCell toHeaderRow :: [Blocks] -> [Row] -toHeaderRow l = [toRow l | not (null l)] +toHeaderRow l = [toRow l | not (null l) && not (all null l)] diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index e8985ab2c..825e4a2eb 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.MediaWiki - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -36,17 +36,18 @@ import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (nested) import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isCommentTag) -import Text.Pandoc.Shared (crFilter, safeRead, stringify, stripTrailingNewlines, +import Text.Pandoc.Shared (safeRead, stringify, stripTrailingNewlines, trim, splitTextBy, tshow) import Text.Pandoc.Walk (walk) import Text.Pandoc.XML (fromEntities) -- | Read mediawiki from an input string and return a Pandoc document. -readMediaWiki :: PandocMonad m - => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) +readMediaWiki :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a -> m Pandoc readMediaWiki opts s = do + let sources = toSources s parsed <- readWithM parseMediaWiki MWState{ mwOptions = opts , mwMaxNestingLevel = 4 , mwNextLinkNumber = 1 @@ -55,7 +56,7 @@ readMediaWiki opts s = do , mwLogMessages = [] , mwInTT = False } - (crFilter s <> "\n") + sources case parsed of Right result -> return result Left e -> throwError e @@ -69,7 +70,7 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwInTT :: Bool } -type MWParser m = ParserT Text MWState m +type MWParser m = ParserT Sources MWState m instance HasReaderOptions MWState where extractReaderOptions = mwOptions @@ -112,12 +113,14 @@ newBlockTags = ["haskell","syntaxhighlight","source","gallery","references"] isBlockTag' :: Tag Text -> Bool isBlockTag' tag@(TagOpen t _) = (isBlockTag tag || t `elem` newBlockTags) && t `notElem` eitherBlockOrInline +isBlockTag' (TagClose "ref") = True -- needed so 'special' doesn't parse it isBlockTag' tag@(TagClose t) = (isBlockTag tag || t `elem` newBlockTags) && t `notElem` eitherBlockOrInline isBlockTag' tag = isBlockTag tag isInlineTag' :: Tag Text -> Bool isInlineTag' (TagComment _) = True +isInlineTag' (TagClose "ref") = False -- see below inlineTag isInlineTag' t = not (isBlockTag' t) eitherBlockOrInline :: [Text] @@ -554,11 +557,17 @@ variable = try $ do contents <- manyTillChar anyChar (try $ string "}}}") return $ "{{{" <> contents <> "}}}" +singleParaToPlain :: Blocks -> Blocks +singleParaToPlain bs = + case B.toList bs of + [Para ils] -> B.fromList [Plain ils] + _ -> bs + inlineTag :: PandocMonad m => MWParser m Inlines inlineTag = do (tag, _) <- lookAhead $ htmlTag isInlineTag' case tag of - TagOpen "ref" _ -> B.note . B.plain <$> inlinesInTags "ref" + TagOpen "ref" _ -> B.note . singleParaToPlain <$> blocksInTags "ref" TagOpen "nowiki" _ -> try $ do (_,raw) <- htmlTag (~== tag) if T.any (== '/') raw @@ -678,19 +687,17 @@ url = do -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (PandocMonad m, Show b) => MWParser m a -> MWParser m b -> MWParser m Inlines inlinesBetween start end = - trimInlines . mconcat <$> try (start >> many1Till inner end) - where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace <* notFollowedBy' end + trimInlines . mconcat <$> try (start >> many1Till inline end) emph :: PandocMonad m => MWParser m Inlines emph = B.emph <$> nested (inlinesBetween start end) - where start = sym "''" >> lookAhead nonspaceChar + where start = sym "''" end = try $ notFollowedBy' (() <$ strong) >> sym "''" strong :: PandocMonad m => MWParser m Inlines strong = B.strong <$> nested (inlinesBetween start end) - where start = sym "'''" >> lookAhead nonspaceChar - end = try $ sym "'''" + where start = sym "'''" + end = sym "'''" doubleQuotes :: PandocMonad m => MWParser m Inlines doubleQuotes = do diff --git a/src/Text/Pandoc/Readers/Metadata.hs b/src/Text/Pandoc/Readers/Metadata.hs index a64b130e5..cbc523b25 100644 --- a/src/Text/Pandoc/Readers/Metadata.hs +++ b/src/Text/Pandoc/Readers/Metadata.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.Metadata - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -14,6 +14,7 @@ Parse YAML/JSON metadata to 'Pandoc' 'Meta'. module Text.Pandoc.Readers.Metadata ( yamlBsToMeta, yamlBsToRefs, + yamlMetaBlock, yamlMap ) where import Control.Monad @@ -30,11 +31,13 @@ import Text.Pandoc.Definition import Text.Pandoc.Error import Text.Pandoc.Parsing hiding (tableWith) import Text.Pandoc.Shared +import qualified Data.Text.Lazy as TL +import qualified Text.Pandoc.UTF8 as UTF8 -yamlBsToMeta :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlBsToMeta :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> BL.ByteString - -> ParserT Text ParserState m (F Meta) + -> ParserT Sources st m (Future st Meta) yamlBsToMeta pMetaValue bstr = do case YAML.decodeNode' YAML.failsafeSchemaResolver False False bstr of Right (YAML.Doc (YAML.Mapping _ _ o):_) @@ -42,6 +45,9 @@ yamlBsToMeta pMetaValue bstr = do Right [] -> return . return $ mempty Right [YAML.Doc (YAML.Scalar _ YAML.SNull)] -> return . return $ mempty + -- the following is what we get from a comment: + Right [YAML.Doc (YAML.Scalar _ (YAML.SUnknown _ ""))] + -> return . return $ mempty Right _ -> Prelude.fail "expected YAML object" Left (yamlpos, err') -> do pos <- getPosition @@ -63,11 +69,11 @@ lookupYAML t (YAML.Mapping _ _ m) = lookupYAML _ _ = Nothing -- Returns filtered list of references. -yamlBsToRefs :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlBsToRefs :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> (Text -> Bool) -- ^ Filter for id -> BL.ByteString - -> ParserT Text ParserState m (F [MetaValue]) + -> ParserT Sources st m (Future st [MetaValue]) yamlBsToRefs pMetaValue idpred bstr = case YAML.decodeNode' YAML.failsafeSchemaResolver False False bstr of Right (YAML.Doc o@YAML.Mapping{}:_) @@ -95,8 +101,12 @@ yamlBsToRefs pMetaValue idpred bstr = Right [YAML.Doc (YAML.Scalar _ YAML.SNull)] -> return . return $ mempty Right _ -> Prelude.fail "expecting YAML object" - Left (_pos, err') - -> Prelude.fail err' + Left (yamlpos, err') + -> do pos <- getPosition + setPosition $ incSourceLine + (setSourceColumn pos (YE.posColumn yamlpos)) + (YE.posLine yamlpos - 1) + Prelude.fail err' nodeToKey :: YAML.Node YE.Pos -> Maybe Text @@ -104,10 +114,10 @@ nodeToKey (YAML.Scalar _ (YAML.SStr t)) = Just t nodeToKey (YAML.Scalar _ (YAML.SUnknown _ t)) = Just t nodeToKey _ = Nothing -normalizeMetaValue :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +normalizeMetaValue :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> Text - -> ParserT Text ParserState m (F MetaValue) + -> ParserT Sources st m (Future st MetaValue) normalizeMetaValue pMetaValue x = -- Note: a standard quoted or unquoted YAML value will -- not end in a newline, but a "block" set off with @@ -129,10 +139,10 @@ checkBoolean t | t == T.pack "false" || t == T.pack "False" || t == T.pack "FALSE" = Just False | otherwise = Nothing -yamlToMetaValue :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlToMetaValue :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> YAML.Node YE.Pos - -> ParserT Text ParserState m (F MetaValue) + -> ParserT Sources st m (Future st MetaValue) yamlToMetaValue pMetaValue (YAML.Scalar _ x) = case x of YAML.SStr t -> normalizeMetaValue pMetaValue t @@ -152,10 +162,10 @@ yamlToMetaValue pMetaValue (YAML.Mapping _ _ o) = fmap MetaMap <$> yamlMap pMetaValue o yamlToMetaValue _ _ = return $ return $ MetaString "" -yamlMap :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlMap :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> M.Map (YAML.Node YE.Pos) (YAML.Node YE.Pos) - -> ParserT Text ParserState m (F (M.Map Text MetaValue)) + -> ParserT Sources st m (Future st (M.Map Text MetaValue)) yamlMap pMetaValue o = do kvs <- forM (M.toList o) $ \(key, v) -> do k <- maybe (throwError $ PandocParseError @@ -171,3 +181,20 @@ yamlMap pMetaValue o = do return $ do v' <- fv return (k, v') + +-- | Parse a YAML metadata block using the supplied 'MetaValue' parser. +yamlMetaBlock :: (HasLastStrPosition st, PandocMonad m) + => ParserT Sources st m (Future st MetaValue) + -> ParserT Sources st m (Future st Meta) +yamlMetaBlock parser = try $ do + string "---" + blankline + notFollowedBy blankline -- if --- is followed by a blank it's an HRULE + rawYamlLines <- manyTill anyLine stopLine + -- by including --- and ..., we allow yaml blocks with just comments: + let rawYaml = T.unlines ("---" : (rawYamlLines ++ ["..."])) + optional blanklines + yamlBsToMeta parser $ UTF8.fromTextLazy $ TL.fromStrict rawYaml + +stopLine :: Monad m => ParserT Sources st m () +stopLine = try $ (string "---" <|> string "...") >> blankline >> return () diff --git a/src/Text/Pandoc/Readers/Muse.hs b/src/Text/Pandoc/Readers/Muse.hs index b4eea9d3a..a0d4534f1 100644 --- a/src/Text/Pandoc/Readers/Muse.hs +++ b/src/Text/Pandoc/Readers/Muse.hs @@ -36,19 +36,20 @@ import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging import Text.Pandoc.Options -import Text.Pandoc.Parsing hiding (F) -import Text.Pandoc.Shared (crFilter, trimr, tshow) +import Text.Pandoc.Parsing +import Text.Pandoc.Shared (trimr, tshow) -- | Read Muse from an input string and return a Pandoc document. -readMuse :: PandocMonad m +readMuse :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readMuse opts s = do - let input = crFilter s - res <- flip runReaderT def $ runParserT parseMuse def{ museOptions = opts } "source" input + let sources = toSources s + res <- flip runReaderT def $ runParserT parseMuse def{ museOptions = opts } + (initialSourceName sources) sources case res of - Left e -> throwError $ PandocParsecError input e + Left e -> throwError $ PandocParsecError sources e Right d -> return d type F = Future MuseState @@ -82,7 +83,7 @@ instance Default MuseEnv where , museInPara = False } -type MuseParser m = ParserT Text MuseState (ReaderT MuseEnv m) +type MuseParser m = ParserT Sources MuseState (ReaderT MuseEnv m) instance HasReaderOptions MuseState where extractReaderOptions = museOptions @@ -155,7 +156,7 @@ firstColumn = getPosition >>= \pos -> guard (sourceColumn pos == 1) -- * Parsers -- | Parse end-of-line, which can be either a newline or end-of-file. -eol :: Stream s m Char => ParserT s st m () +eol :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () eol = void newline <|> eof getIndent :: PandocMonad m diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs index 42843381a..58f235e81 100644 --- a/src/Text/Pandoc/Readers/Native.hs +++ b/src/Text/Pandoc/Readers/Native.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Native - Copyright : Copyright (C) 2011-2020 John MacFarlane + Copyright : Copyright (C) 2011-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -21,6 +21,7 @@ import Control.Monad.Except (throwError) import Data.Text (Text) import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Error +import Text.Pandoc.Sources (ToSources(..), sourcesToText) -- | Read native formatted text and return a Pandoc document. -- The input may be a full pandoc document, a block list, a block, @@ -32,14 +33,15 @@ import Text.Pandoc.Error -- -- > Pandoc nullMeta [Plain [Str "hi"]] -- -readNative :: PandocMonad m +readNative :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readNative _ s = - case maybe (Pandoc nullMeta <$> readBlocks s) Right (safeRead s) of - Right doc -> return doc - Left _ -> throwError $ PandocParseError "couldn't read native" + let t = sourcesToText . toSources $ s + in case maybe (Pandoc nullMeta <$> readBlocks t) Right (safeRead t) of + Right doc -> return doc + Left _ -> throwError $ PandocParseError "couldn't read native" readBlocks :: Text -> Either PandocError [Block] readBlocks s = maybe ((:[]) <$> readBlock s) Right (safeRead s) diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index 903cdf4a1..668c9ca11 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.OPML - Copyright : Copyright (C) 2013-2020 John MacFarlane + Copyright : Copyright (C) 2013-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -13,20 +13,21 @@ Conversion of OPML to 'Pandoc' document. module Text.Pandoc.Readers.OPML ( readOPML ) where import Control.Monad.State.Strict -import Data.Char (toUpper) import Data.Default -import Data.Generics import Data.Maybe (fromMaybe) import Data.Text (Text) import qualified Data.Text as T -import Text.HTML.TagSoup.Entity (lookupEntity) +import qualified Data.Text.Lazy as TL import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) -import Text.Pandoc.Shared (crFilter, blocksToInlines') -import Text.XML.Light +import Text.Pandoc.Shared (blocksToInlines') +import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import Text.Pandoc.XML.Light +import Control.Monad.Except (throwError) type OPML m = StateT OPMLState m @@ -46,42 +47,27 @@ instance Default OPMLState where , opmlOptions = def } -readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readOPML :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readOPML opts inp = do - (bs, st') <- runStateT - (mapM parseBlock $ normalizeTree $ - parseXML (T.unpack (crFilter inp))) def{ opmlOptions = opts } + let sources = toSources inp + (bs, st') <- + runStateT (case parseXMLContents (TL.fromStrict . sourcesToText $ sources) of + Left msg -> throwError $ PandocXMLError "" msg + Right ns -> mapM parseBlock ns) + def{ opmlOptions = opts } return $ setTitle (opmlDocTitle st') $ setAuthors (opmlDocAuthors st') $ setDate (opmlDocDate st') $ doc $ mconcat bs --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) - -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) - -textContent :: Element -> Text -textContent = T.pack . strContent + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- exceptT :: PandocMonad m => Either PandocError a -> OPML m a -- exceptT = either throwError return @@ -105,11 +91,11 @@ parseBlock :: PandocMonad m => Content -> OPML m Blocks parseBlock (Elem e) = case qName (elName e) of "ownerName" -> mempty <$ modify (\st -> - st{opmlDocAuthors = [text $ textContent e]}) + st{opmlDocAuthors = [text $ strContent e]}) "dateModified" -> mempty <$ modify (\st -> - st{opmlDocDate = text $ textContent e}) + st{opmlDocDate = text $ strContent e}) "title" -> mempty <$ modify (\st -> - st{opmlDocTitle = text $ textContent e}) + st{opmlDocTitle = text $ strContent e}) "outline" -> gets opmlSectionLevel >>= sect . (+1) "?xml" -> return mempty _ -> getBlocks e diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 7b8bfd4b5..c274b6fd4 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE PatternGuards #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Reader.Odt @@ -15,7 +14,7 @@ Entry point to the odt reader. module Text.Pandoc.Readers.Odt ( readOdt ) where import Codec.Archive.Zip -import qualified Text.XML.Light as XML +import Text.Pandoc.XML.Light import qualified Data.ByteString.Lazy as B @@ -23,6 +22,8 @@ import System.FilePath import Control.Monad.Except (throwError) +import qualified Data.Text as T + import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition @@ -60,35 +61,37 @@ readOdt' _ bytes = bytesToOdt bytes-- of bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag) bytesToOdt bytes = case toArchiveOrFail bytes of Right archive -> archiveToOdt archive - Left _ -> Left $ PandocParseError "Couldn't parse odt file." + Left err -> Left $ PandocParseError + $ "Could not unzip ODT: " <> T.pack err -- archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag) -archiveToOdt archive - | Just contentEntry <- findEntryByPath "content.xml" archive - , Just stylesEntry <- findEntryByPath "styles.xml" archive - , Just contentElem <- entryToXmlElem contentEntry - , Just stylesElem <- entryToXmlElem stylesEntry - , Right styles <- chooseMax (readStylesAt stylesElem ) - (readStylesAt contentElem) - , media <- filteredFilesFromArchive archive filePathIsOdtMedia - , startState <- readerState styles media - , Right pandocWithMedia <- runConverter' read_body - startState - contentElem - - = Right pandocWithMedia - - | otherwise - -- Not very detailed, but I don't think more information would be helpful - = Left $ PandocParseError "Couldn't parse odt file." - where - filePathIsOdtMedia :: FilePath -> Bool +archiveToOdt archive = do + let onFailure msg Nothing = Left $ PandocParseError msg + onFailure _ (Just x) = Right x + contentEntry <- onFailure "Could not find content.xml" + (findEntryByPath "content.xml" archive) + stylesEntry <- onFailure "Could not find styles.xml" + (findEntryByPath "styles.xml" archive) + contentElem <- entryToXmlElem contentEntry + stylesElem <- entryToXmlElem stylesEntry + styles <- either + (\_ -> Left $ PandocParseError "Could not read styles") + Right + (chooseMax (readStylesAt stylesElem ) (readStylesAt contentElem)) + let filePathIsOdtMedia :: FilePath -> Bool filePathIsOdtMedia fp = let (dir, name) = splitFileName fp in (dir == "Pictures/") || (dir /= "./" && name == "content.xml") + let media = filteredFilesFromArchive archive filePathIsOdtMedia + let startState = readerState styles media + either (\_ -> Left $ PandocParseError "Could not convert opendocument") Right + (runConverter' read_body startState contentElem) -- -entryToXmlElem :: Entry -> Maybe XML.Element -entryToXmlElem = XML.parseXMLDoc . UTF8.toStringLazy . fromEntry +entryToXmlElem :: Entry -> Either PandocError Element +entryToXmlElem entry = + case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of + Right x -> Right x + Left msg -> Left $ PandocXMLError (T.pack $ eRelativePath entry) msg diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs index 93c6b5e79..96515bf56 100644 --- a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs +++ b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs @@ -22,7 +22,7 @@ module Text.Pandoc.Readers.Odt.Arrows.State where import Control.Arrow import qualified Control.Category as Cat import Control.Monad - +import Data.List (foldl') import Text.Pandoc.Readers.Odt.Arrows.Utils import Text.Pandoc.Readers.Odt.Generic.Fallible @@ -122,7 +122,7 @@ iterateS a = ArrowState $ \(s,f) -> foldr a' (s,mzero) f iterateSL :: (Foldable f, MonadPlus m) => ArrowState s x y -> ArrowState s (f x) (m y) -iterateSL a = ArrowState $ \(s,f) -> foldl a' (s,mzero) f +iterateSL a = ArrowState $ \(s,f) -> foldl' a' (s,mzero) f where a' (s',m) x = second (mplus m.return) $ runArrowState a (s',x) diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 43c44e7e9..5520d039f 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -1,4 +1,5 @@ {-# LANGUAGE Arrows #-} +{-# LANGUAGE CPP #-} {-# LANGUAGE DeriveFoldable #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE PatternGuards #-} @@ -29,14 +30,14 @@ import Control.Monad ((<=<)) import qualified Data.ByteString.Lazy as B import Data.Foldable (fold) -import Data.List (find, stripPrefix) +import Data.List (find) import qualified Data.Map as M import qualified Data.Text as T import Data.Maybe -import Data.Semigroup (First(..), Option(..)) +import Data.Monoid (Alt (..)) import Text.TeXMath (readMathML, writeTeX) -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Builder hiding (underline) import Text.Pandoc.MediaBag (MediaBag, insertMedia) @@ -505,13 +506,11 @@ type InlineMatcher = ElementMatcher Inlines type BlockMatcher = ElementMatcher Blocks - -newtype FirstMatch a = FirstMatch (Option (First a)) - deriving (Foldable, Monoid, Semigroup) +newtype FirstMatch a = FirstMatch (Alt Maybe a) + deriving (Foldable, Monoid, Semigroup) firstMatch :: a -> FirstMatch a -firstMatch = FirstMatch . Option . Just . First - +firstMatch = FirstMatch . Alt . Just -- matchingElement :: (Monoid e) @@ -557,7 +556,7 @@ read_plain_text = fst ^&&& read_plain_text' >>% recover >>?% mappend -- extractText :: XML.Content -> Fallible T.Text - extractText (XML.Text cData) = succeedWith (T.pack $ XML.cdData cData) + extractText (XML.Text cData) = succeedWith (XML.cdData cData) extractText _ = failEmpty read_text_seq :: InlineMatcher @@ -577,7 +576,10 @@ read_spaces = matchingElement NsText "s" ( read_line_break :: InlineMatcher read_line_break = matchingElement NsText "line-break" $ returnV linebreak - +-- +read_tab :: InlineMatcher +read_tab = matchingElement NsText "tab" + $ returnV space -- read_span :: InlineMatcher read_span = matchingElement NsText "span" @@ -585,6 +587,7 @@ read_span = matchingElement NsText "span" $ matchChildContent [ read_span , read_spaces , read_line_break + , read_tab , read_link , read_note , read_citation @@ -604,6 +607,7 @@ read_paragraph = matchingElement NsText "p" $ matchChildContent [ read_span , read_spaces , read_line_break + , read_tab , read_link , read_note , read_citation @@ -630,6 +634,7 @@ read_header = matchingElement NsText "h" children <- ( matchChildContent [ read_span , read_spaces , read_line_break + , read_tab , read_link , read_note , read_citation @@ -777,14 +782,14 @@ read_frame_img = "" -> returnV mempty -< () src' -> do let exts = extensionsFromList [Ext_auto_identifiers] - resource <- lookupResource -< src' + resource <- lookupResource -< T.unpack src' _ <- updateMediaWithResource -< resource w <- findAttrText' NsSVG "width" -< () h <- findAttrText' NsSVG "height" -< () titleNodes <- matchChildContent' [ read_frame_title ] -< () alt <- matchChildContent [] read_plain_text -< () arr (firstMatch . uncurry4 imageWith) -< - (image_attributes w h, T.pack src', inlineListToIdentifier exts (toList titleNodes), alt) + (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt) read_frame_title :: InlineMatcher read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) @@ -804,7 +809,8 @@ read_frame_mathml = case fold src of "" -> returnV mempty -< () src' -> do - let path = fromMaybe src' (stripPrefix "./" src') ++ "/content.xml" + let path = T.unpack $ + fromMaybe src' (T.stripPrefix "./" src') <> "/content.xml" (_, mathml) <- lookupResource -< path case readMathML (UTF8.toText $ B.toStrict mathml) of Left _ -> returnV mempty -< () diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs index 77174c793..78a7fc0b2 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs @@ -14,9 +14,10 @@ typesafe Haskell namespace identifiers and unsafe "real world" namespaces. module Text.Pandoc.Readers.Odt.Generic.Namespaces where import qualified Data.Map as M +import Data.Text (Text) -- -type NameSpaceIRI = String +type NameSpaceIRI = Text -- type NameSpaceIRIs nsID = M.Map nsID NameSpaceIRI diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs index 6dc56a0d9..edefe3c70 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs @@ -20,7 +20,6 @@ module Text.Pandoc.Readers.Odt.Generic.Utils , reverseComposition , tryToRead , Lookupable(..) -, readLookupables , readLookupable , readPercent , findBy @@ -30,11 +29,11 @@ module Text.Pandoc.Readers.Odt.Generic.Utils import Control.Category (Category, (<<<), (>>>)) import qualified Control.Category as Cat (id) -import Control.Monad (msum) - +import Data.Char (isSpace) import qualified Data.Foldable as F (Foldable, foldr) import Data.Maybe - +import Data.Text (Text) +import qualified Data.Text as T -- | Equivalent to -- > foldr (.) id @@ -76,8 +75,8 @@ swing = flip.(.flip id) -- (nobody wants that) while the latter returns "to much" for simple purposes. -- This function instead applies 'reads' and returns the first match (if any) -- in a 'Maybe'. -tryToRead :: (Read r) => String -> Maybe r -tryToRead = reads >>> listToMaybe >>> fmap fst +tryToRead :: (Read r) => Text -> Maybe r +tryToRead = (reads . T.unpack) >>> listToMaybe >>> fmap fst -- | A version of 'reads' that requires a '%' sign after the number readPercent :: ReadS Int @@ -88,26 +87,12 @@ readPercent s = [ (i,s') | (i , r ) <- reads s -- | Data that can be looked up. -- This is mostly a utility to read data with kind *. class Lookupable a where - lookupTable :: [(String, a)] - --- | The idea is to use this function as if there was a declaration like --- --- > instance (Lookupable a) => (Read a) where --- > readsPrec _ = readLookupables --- . --- But including this code in this form would need UndecideableInstances. --- That is a bad idea. Luckily 'readLookupable' (without the s at the end) --- can be used directly in almost any case. -readLookupables :: (Lookupable a) => String -> [(a,String)] -readLookupables s = [ (a,rest) | (word,rest) <- lex s, - a <- maybeToList (lookup word lookupTable) - ] + lookupTable :: [(Text, a)] -- | Very similar to a simple 'lookup' in the 'lookupTable', but with a lexer. -readLookupable :: (Lookupable a) => String -> Maybe a -readLookupable s = msum - $ map ((`lookup` lookupTable).fst) - $ lex s +readLookupable :: (Lookupable a) => Text -> Maybe a +readLookupable s = + lookup (T.takeWhile (not . isSpace) $ T.dropWhile isSpace s) lookupTable uncurry3 :: (a->b->c -> z) -> (a,b,c ) -> z uncurry4 :: (a->b->c->d -> z) -> (a,b,c,d ) -> z diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs index 00c636a0d..341903046 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE LambdaCase #-} @@ -60,15 +61,15 @@ import Control.Arrow import Data.Bool ( bool ) import Data.Either ( rights ) import qualified Data.Map as M -import qualified Data.Text as T +import Data.Text (Text) import Data.Default import Data.Maybe +import Data.List (foldl') -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Readers.Odt.Arrows.State import Text.Pandoc.Readers.Odt.Arrows.Utils - import Text.Pandoc.Readers.Odt.Generic.Namespaces import Text.Pandoc.Readers.Odt.Generic.Utils import Text.Pandoc.Readers.Odt.Generic.Fallible @@ -78,13 +79,13 @@ import Text.Pandoc.Readers.Odt.Generic.Fallible -------------------------------------------------------------------------------- -- -type ElementName = String -type AttributeName = String -type AttributeValue = String -type TextAttributeValue = T.Text +type ElementName = Text +type AttributeName = Text +type AttributeValue = Text +type TextAttributeValue = Text -- -type NameSpacePrefix = String +type NameSpacePrefix = Text -- type NameSpacePrefixes nsID = M.Map nsID NameSpacePrefix @@ -292,7 +293,7 @@ readNSattributes = fromState $ \state -> maybe (state, failEmpty ) => XMLConverterState nsID extraState -> Maybe (XMLConverterState nsID extraState) extractNSAttrs startState - = foldl (\state d -> state >>= addNS d) + = foldl' (\state d -> state >>= addNS d) (Just startState) nsAttribs where nsAttribs = mapMaybe readNSattr (XML.elAttribs element) @@ -461,7 +462,7 @@ lookupDefaultingAttr :: (NameSpaceID nsID, Lookupable a, Default a) lookupDefaultingAttr nsID attrName = lookupAttrWithDefault nsID attrName def --- | Return value as a (Maybe String) +-- | Return value as a (Maybe Text) findAttr' :: (NameSpaceID nsID) => nsID -> AttributeName -> XMLConverter nsID extraState x (Maybe AttributeValue) @@ -477,7 +478,6 @@ findAttrText' nsID attrName = qualifyName nsID attrName &&& getCurrentElement >>% XML.findAttr - >>^ fmap T.pack -- | Return value as string or fail findAttr :: (NameSpaceID nsID) @@ -492,7 +492,6 @@ findAttrText :: (NameSpaceID nsID) -> FallibleXMLConverter nsID extraState x TextAttributeValue findAttrText nsID attrName = findAttr' nsID attrName - >>^ fmap T.pack >>> maybeToChoice -- | Return value as string or return provided default value @@ -511,7 +510,7 @@ findAttrTextWithDefault :: (NameSpaceID nsID) -> XMLConverter nsID extraState x TextAttributeValue findAttrTextWithDefault nsID attrName deflt = findAttr' nsID attrName - >>^ maybe deflt T.pack + >>^ fromMaybe deflt -- | Read and return value or fail readAttr :: (NameSpaceID nsID, Read attrValue) @@ -748,7 +747,7 @@ matchContent lookups fallback -- Internals -------------------------------------------------------------------------------- -stringToBool' :: String -> Maybe Bool +stringToBool' :: Text -> Maybe Bool stringToBool' val | val `elem` trueValues = Just True | val `elem` falseValues = Just False | otherwise = Nothing diff --git a/src/Text/Pandoc/Readers/Odt/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Namespaces.hs index 3a24a1162..70741c28d 100644 --- a/src/Text/Pandoc/Readers/Odt/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Namespaces.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Reader.Odt.Namespaces Copyright : Copyright (C) 2015 Martin Linnemann @@ -13,10 +14,10 @@ Namespaces used in odt files. module Text.Pandoc.Readers.Odt.Namespaces ( Namespace (..) ) where -import Data.List (isPrefixOf) import qualified Data.Map as M (empty, insert) import Data.Maybe (fromMaybe, listToMaybe) - +import Data.Text (Text) +import qualified Data.Text as T import Text.Pandoc.Readers.Odt.Generic.Namespaces @@ -30,7 +31,7 @@ instance NameSpaceID Namespace where findID :: NameSpaceIRI -> Maybe Namespace -findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `isPrefixOf` iri] +findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `T.isPrefixOf` iri] nsIDmap :: NameSpaceIRIs Namespace nsIDmap = foldr (uncurry $ flip M.insert) M.empty nsIDs @@ -54,12 +55,12 @@ data Namespace = -- Open Document core -- Core XML (basically only for the 'id'-attribute) | NsXML -- Fallback - | NsOther String + | NsOther Text deriving ( Eq, Ord, Show ) -- | Not the actual iri's, but large prefixes of them - this way there are -- less versioning problems and the like. -nsIDs :: [(String,Namespace)] +nsIDs :: [(Text, Namespace)] nsIDs = [ ("urn:oasis:names:tc:opendocument:xmlns:animation" , NsAnim ), ("urn:oasis:names:tc:opendocument:xmlns:chart" , NsChart ), diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 46a777df1..ca791ad1e 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -2,6 +2,7 @@ {-# LANGUAGE Arrows #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Odt.StyleReader Copyright : Copyright (C) 2015 Martin Linnemann @@ -43,14 +44,16 @@ import Control.Arrow import Data.Default import qualified Data.Foldable as F -import Data.List (unfoldr) +import Data.List (unfoldr, foldl') import qualified Data.Map as M import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T import qualified Data.Set as S -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML -import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Shared (safeRead, tshow) import Text.Pandoc.Readers.Odt.Arrows.Utils @@ -90,7 +93,7 @@ instance Default FontPitch where -- -- Thus, we want -type FontFaceName = String +type FontFaceName = Text type FontPitches = M.Map FontFaceName FontPitch @@ -117,7 +120,7 @@ fontPitchReader = executeInSub NsOffice "font-face-decls" ( &&& lookupDefaultingAttr NsStyle "font-pitch" )) - >>?^ ( M.fromList . foldl accumLegalPitches [] ) + >>?^ ( M.fromList . foldl' accumLegalPitches [] ) ) `ifFailedDo` returnV (Right M.empty) where accumLegalPitches ls (Nothing,_) = ls accumLegalPitches ls (Just n,p) = (n,p):ls @@ -151,7 +154,7 @@ findPitch = ( lookupAttr NsStyle "font-pitch" -- Definitions of main data -------------------------------------------------------------------------------- -type StyleName = String +type StyleName = Text -- | There are two types of styles: named styles with a style family and an -- optional style parent, and default styles for each style family, @@ -355,8 +358,8 @@ getListLevelStyle level ListStyle{..} = -- \^ simpler, but in general less efficient data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType - , listItemPrefix :: Maybe String - , listItemSuffix :: Maybe String + , listItemPrefix :: Maybe Text + , listItemSuffix :: Maybe Text , listItemFormat :: ListItemNumberFormat , listItemStart :: Int } @@ -366,9 +369,9 @@ instance Show ListLevelStyle where show ListLevelStyle{..} = "<LLS|" ++ show listLevelType ++ "|" - ++ maybeToString listItemPrefix + ++ maybeToString (T.unpack <$> listItemPrefix) ++ show listItemFormat - ++ maybeToString listItemSuffix + ++ maybeToString (T.unpack <$> listItemSuffix) ++ ">" where maybeToString = fromMaybe "" @@ -471,7 +474,7 @@ readTextProperties = ) where isFontEmphasised = [("normal",False),("italic",True),("oblique",True)] isFontBold = ("normal",False):("bold",True) - :map ((,True).show) ([100,200..900]::[Int]) + :map ((,True) . tshow) ([100,200..900]::[Int]) readUnderlineMode :: StyleReaderSafe _x (Maybe UnderlineMode) readUnderlineMode = readLineMode "text-underline-mode" @@ -481,7 +484,7 @@ readStrikeThroughMode :: StyleReaderSafe _x (Maybe UnderlineMode) readStrikeThroughMode = readLineMode "text-line-through-mode" "text-line-through-style" -readLineMode :: String -> String -> StyleReaderSafe _x (Maybe UnderlineMode) +readLineMode :: Text -> Text -> StyleReaderSafe _x (Maybe UnderlineMode) readLineMode modeAttr styleAttr = proc x -> do isUL <- searchAttr NsStyle styleAttr False isLinePresent -< x mode <- lookupAttr' NsStyle modeAttr -< x @@ -560,12 +563,13 @@ readListLevelStyle levelType = readAttr NsText "level" -- chooseMostSpecificListLevelStyle :: S.Set ListLevelStyle -> Maybe ListLevelStyle -chooseMostSpecificListLevelStyle ls | ls == mempty = Nothing - | otherwise = Just ( F.foldr1 select ls ) +chooseMostSpecificListLevelStyle ls = F.foldr select Nothing ls where - select ( ListLevelStyle t1 p1 s1 f1 b1 ) - ( ListLevelStyle t2 p2 s2 f2 _ ) - = ListLevelStyle (select' t1 t2) (p1 <|> p2) (s1 <|> s2) (selectLinf f1 f2) b1 + select l Nothing = Just l + select ( ListLevelStyle t1 p1 s1 f1 b1 ) + ( Just ( ListLevelStyle t2 p2 s2 f2 _ )) + = Just $ ListLevelStyle (select' t1 t2) (p1 <|> p2) (s1 <|> s2) + (selectLinf f1 f2) b1 select' LltNumbered _ = LltNumbered select' _ LltNumbered = LltNumbered select' _ _ = LltBullet diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 851aec103..8823befdd 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -18,22 +18,19 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing (reportLogMessages) -import Text.Pandoc.Shared (crFilter) - +import Text.Pandoc.Sources (ToSources(..), ensureFinalNewlines) import Control.Monad.Except (throwError) import Control.Monad.Reader (runReaderT) -import Data.Text (Text) - -- | Parse org-mode string and return a Pandoc document. -readOrg :: PandocMonad m +readOrg :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readOrg opts s = do parsed <- flip runReaderT def $ readWithM parseOrg (optionsToParserState opts) - (crFilter s <> "\n\n") + (ensureFinalNewlines 2 (toSources s)) case parsed of Right result -> return result Left e -> throwError e diff --git a/src/Text/Pandoc/Readers/Org/BlockStarts.hs b/src/Text/Pandoc/Readers/Org/BlockStarts.hs index 8f7cac6ea..14233569c 100644 --- a/src/Text/Pandoc/Readers/Org/BlockStarts.hs +++ b/src/Text/Pandoc/Readers/Org/BlockStarts.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.BlockStarts - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/Blocks.hs b/src/Text/Pandoc/Readers/Org/Blocks.hs index f2e8b1ab6..f18d2f9a7 100644 --- a/src/Text/Pandoc/Readers/Org/Blocks.hs +++ b/src/Text/Pandoc/Readers/Org/Blocks.hs @@ -1,9 +1,10 @@ {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} {- | Module : Text.Pandoc.Readers.Org.Blocks - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -38,10 +39,12 @@ import Data.Functor (($>)) import Data.List (foldl', intersperse) import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Text (Text) - +import Data.List.NonEmpty (nonEmpty) +import System.FilePath import qualified Data.Text as T import qualified Text.Pandoc.Builder as B import qualified Text.Pandoc.Walk as Walk +import Text.Pandoc.Sources (ToSources(..)) -- -- parsing blocks @@ -294,24 +297,22 @@ verseBlock blockType = try $ do codeBlock :: PandocMonad m => BlockAttributes -> Text -> OrgParser m (F Blocks) codeBlock blockAttrs blockType = do skipSpaces - (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) - content <- rawBlockContent blockType - resultsContent <- option mempty babelResultsBlock - let id' = fromMaybe mempty $ blockAttrName blockAttrs - let codeBlck = B.codeBlockWith ( id', classes, kv ) content - let labelledBlck = maybe (pure codeBlck) - (labelDiv codeBlck) - (blockAttrCaption blockAttrs) + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + content <- rawBlockContent blockType + resultsContent <- option mempty babelResultsBlock + let identifier = fromMaybe mempty $ blockAttrName blockAttrs + let codeBlk = B.codeBlockWith (identifier, classes, kv) content + let wrap = maybe pure addCaption (blockAttrCaption blockAttrs) return $ - (if exportsCode kv then labelledBlck else mempty) <> + (if exportsCode kv then wrap codeBlk else mempty) <> (if exportsResults kv then resultsContent else mempty) where - labelDiv :: Blocks -> F Inlines -> F Blocks - labelDiv blk value = - B.divWith nullAttr <$> (mappend <$> labelledBlock value <*> pure blk) + addCaption :: F Inlines -> Blocks -> F Blocks + addCaption caption blk = B.divWith ("", ["captioned-content"], []) + <$> (mkCaptionBlock caption <> pure blk) - labelledBlock :: F Inlines -> F Blocks - labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) + mkCaptionBlock :: F Inlines -> F Blocks + mkCaptionBlock = fmap (B.divWith ("", ["caption"], []) . B.plain) exportsResults :: [(Text, Text)] -> Bool exportsResults = maybe False (`elem` ["results", "both"]) . lookup "exports" @@ -527,7 +528,9 @@ include = try $ do _ -> nullAttr return $ pure . B.codeBlockWith attr <$> parseRaw _ -> return $ return . B.fromList . blockFilter params <$> blockList - insertIncludedFileF blocksParser ["."] filename + currentDir <- takeDirectory . sourceName <$> getPosition + insertIncludedFile blocksParser toSources + [currentDir] filename Nothing Nothing where includeTarget :: PandocMonad m => OrgParser m FilePath includeTarget = do @@ -543,8 +546,7 @@ include = try $ do in case (minlvl >>= safeRead :: Maybe Int) of Nothing -> blks Just lvl -> let levels = Walk.query headerLevel blks - -- CAVE: partial function in else - curMin = if null levels then 0 else minimum levels + curMin = maybe 0 minimum $ nonEmpty levels in Walk.walk (shiftHeader (curMin - lvl)) blks headerLevel :: Block -> [Int] @@ -852,16 +854,52 @@ definitionListItem parseIndentedMarker = try $ do definitionMarker = spaceChar *> string "::" <* (spaceChar <|> lookAhead newline) +-- | Checkbox for tasks. +data Checkbox + = UncheckedBox + | CheckedBox + | SemicheckedBox + +-- | Parses a checkbox in a plain list. +checkbox :: PandocMonad m + => OrgParser m Checkbox +checkbox = do + guardEnabled Ext_task_lists + try (char '[' *> status <* char ']') <?> "checkbox" + where + status = choice + [ UncheckedBox <$ char ' ' + , CheckedBox <$ char 'X' + , SemicheckedBox <$ char '-' + ] + +checkboxToInlines :: Checkbox -> Inline +checkboxToInlines = B.Str . \case + UncheckedBox -> "☐" + SemicheckedBox -> "☐" + CheckedBox -> "☒" + -- | parse raw text for one list item listItem :: PandocMonad m => OrgParser m Int -> OrgParser m (F Blocks) listItem parseIndentedMarker = try . withContext ListItemState $ do markerLength <- try parseIndentedMarker + box <- optionMaybe checkbox firstLine <- anyLineNewline blank <- option "" ("\n" <$ blankline) rest <- T.concat <$> many (listContinuation markerLength) - parseFromString blocks $ firstLine <> blank <> rest + contents <- parseFromString blocks $ firstLine <> blank <> rest + return (maybe id (prependInlines . checkboxToInlines) box <$> contents) + +-- | Prepend inlines to blocks, adding them to the first paragraph or +-- creating a new Plain element if necessary. +prependInlines :: Inline -> Blocks -> Blocks +prependInlines inlns = B.fromList . prepend . B.toList + where + prepend (Plain is : bs) = Plain (inlns : Space : is) : bs + prepend (Para is : bs) = Para (inlns : Space : is) : bs + prepend bs = Plain [inlns, Space] : bs -- continuation of a list item - indented and separated by blankline or endline. -- Note: nested lists are parsed as continuations. diff --git a/src/Text/Pandoc/Readers/Org/DocumentTree.hs b/src/Text/Pandoc/Readers/Org/DocumentTree.hs index 3b363270c..2dcbecb1d 100644 --- a/src/Text/Pandoc/Readers/Org/DocumentTree.hs +++ b/src/Text/Pandoc/Readers/Org/DocumentTree.hs @@ -3,7 +3,7 @@ {-# LANGUAGE TupleSections #-} {- | Module : Text.Pandoc.Readers.Org.DocumentTree - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/ExportSettings.hs b/src/Text/Pandoc/Readers/Org/ExportSettings.hs index 9399ebd54..401e1bd8f 100644 --- a/src/Text/Pandoc/Readers/Org/ExportSettings.hs +++ b/src/Text/Pandoc/Readers/Org/ExportSettings.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.ExportSettings - Copyright : © 2016–2020 Albert Krewinkel + Copyright : © 2016-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index b234bee58..6862dd71e 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.Inlines - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -29,6 +29,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.TeXMath (DisplayType (..), readTeX, writePandoc) +import Text.Pandoc.Sources (ToSources(..)) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap import Control.Monad (guard, mplus, mzero, unless, void, when) @@ -262,7 +263,7 @@ berkeleyCitationList = try $ do where citationListPart :: PandocMonad m => OrgParser m (F Inlines) citationListPart = fmap (trimInlinesF . mconcat) . try . many1 $ do - notFollowedBy' citeKey + notFollowedBy' $ citeKey False notFollowedBy (oneOf ";]") inline @@ -277,7 +278,7 @@ berkeleyBareTag' = try $ void (string "cite") berkeleyTextualCite :: PandocMonad m => OrgParser m (F [Citation]) berkeleyTextualCite = try $ do - (suppressAuthor, key) <- citeKey + (suppressAuthor, key) <- citeKey False returnF . return $ Citation { citationId = key , citationPrefix = mempty @@ -322,7 +323,7 @@ linkLikeOrgRefCite = try $ do -- from the `org-ref-cite-re` variable in `org-ref.el`. orgRefCiteKey :: PandocMonad m => OrgParser m Text orgRefCiteKey = - let citeKeySpecialChars = "-_:\\./," :: String + let citeKeySpecialChars = "-_:\\./" :: String isCiteKeySpecialChar c = c `elem` citeKeySpecialChars isCiteKeyChar c = isAlphaNum c || isCiteKeySpecialChar c endOfCitation = try $ do @@ -350,7 +351,7 @@ citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) citation :: PandocMonad m => OrgParser m (F Citation) citation = try $ do pref <- prefix - (suppress_author, key) <- citeKey + (suppress_author, key) <- citeKey False suff <- suffix return $ do x <- pref @@ -367,7 +368,7 @@ citation = try $ do } where prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + manyTill inline (char ']' <|> (']' <$ lookAhead (citeKey False))) suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) skipSpaces @@ -477,17 +478,17 @@ linkToInlinesF linkStr = internalLink :: Text -> Inlines -> F Inlines internalLink link title = do - anchorB <- (link `elem`) <$> asksF orgStateAnchorIds - if anchorB + ids <- asksF orgStateAnchorIds + if link `elem` ids then return $ B.link ("#" <> link) "" title - else return $ B.emph title + else let attr' = ("", ["spurious-link"] , [("target", link)]) + in return $ B.spanWith attr' (B.emph title) -- | Parse an anchor like @<<anchor-id>>@ and return an empty span with -- @anchor-id@ set as id. Legal anchors in org-mode are defined through -- @org-target-regexp@, which is fairly liberal. Since no link is created if -- @anchor-id@ contains spaces, we are more restrictive in what is accepted as -- an anchor. - anchor :: PandocMonad m => OrgParser m (F Inlines) anchor = try $ do anchorId <- parseAnchor @@ -501,7 +502,6 @@ anchor = try $ do -- | Replace every char but [a-zA-Z0-9_.-:] with a hyphen '-'. This mirrors -- the org function @org-export-solidify-link-text@. - solidify :: Text -> Text solidify = T.map replaceSpecialChar where replaceSpecialChar c @@ -573,7 +573,7 @@ underline :: PandocMonad m => OrgParser m (F Inlines) underline = fmap B.underline <$> emphasisBetween '_' verbatim :: PandocMonad m => OrgParser m (F Inlines) -verbatim = return . B.code <$> verbatimBetween '=' +verbatim = return . B.codeWith ("", ["verbatim"], []) <$> verbatimBetween '=' code :: PandocMonad m => OrgParser m (F Inlines) code = return . B.code <$> verbatimBetween '~' @@ -803,7 +803,7 @@ inlineLaTeX = try $ do parseAsInlineLaTeX :: PandocMonad m => Text -> TeXExport -> OrgParser m (Maybe Inlines) parseAsInlineLaTeX cs = \case - TeXExport -> maybeRight <$> runParserT inlineCommand state "" cs + TeXExport -> maybeRight <$> runParserT inlineCommand state "" (toSources cs) TeXIgnore -> return (Just mempty) TeXVerbatim -> return (Just $ B.str cs) diff --git a/src/Text/Pandoc/Readers/Org/Meta.hs b/src/Text/Pandoc/Readers/Org/Meta.hs index 4864d9478..a1b21046a 100644 --- a/src/Text/Pandoc/Readers/Org/Meta.hs +++ b/src/Text/Pandoc/Readers/Org/Meta.hs @@ -3,7 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.Meta - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -239,7 +239,7 @@ lineOfInlines = do todoSequence :: Monad m => OrgParser m TodoSequence todoSequence = try $ do todoKws <- todoKeywords - doneKws <- optionMaybe $ todoDoneSep *> todoKeywords + doneKws <- optionMaybe $ todoDoneSep *> doneKeywords newline -- There must be at least one DONE keyword. The last TODO keyword is -- taken if necessary. @@ -250,11 +250,17 @@ todoSequence = try $ do (x:xs) -> return $ keywordsToSequence (reverse xs) [x] where + todoKeyword :: Monad m => OrgParser m Text + todoKeyword = many1Char nonspaceChar <* skipSpaces + todoKeywords :: Monad m => OrgParser m [Text] todoKeywords = try $ - let keyword = many1Char nonspaceChar <* skipSpaces - endOfKeywords = todoDoneSep <|> void newline - in manyTill keyword (lookAhead endOfKeywords) + let endOfKeywords = todoDoneSep <|> void newline + in manyTill todoKeyword (lookAhead endOfKeywords) + + doneKeywords :: Monad m => OrgParser m [Text] + doneKeywords = try $ + manyTill (todoKeyword <* optional todoDoneSep) (lookAhead newline) todoDoneSep :: Monad m => OrgParser m () todoDoneSep = void . try $ skipSpaces *> char '|' <* skipSpaces1 diff --git a/src/Text/Pandoc/Readers/Org/ParserState.hs b/src/Text/Pandoc/Readers/Org/ParserState.hs index 1e4799e7b..abe8a9ebf 100644 --- a/src/Text/Pandoc/Readers/Org/ParserState.hs +++ b/src/Text/Pandoc/Readers/Org/ParserState.hs @@ -3,7 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.ParserState - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/Parsing.hs b/src/Text/Pandoc/Readers/Org/Parsing.hs index bce71c24d..f0949e205 100644 --- a/src/Text/Pandoc/Readers/Org/Parsing.hs +++ b/src/Text/Pandoc/Readers/Org/Parsing.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Readers.Org.Parsing - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -63,8 +63,7 @@ module Text.Pandoc.Readers.Org.Parsing , ellipses , citeKey , gridTableWith - , insertIncludedFileF - -- * Re-exports from Text.Pandoc.Parsec + , insertIncludedFile , runParser , runParserT , getInput @@ -100,21 +99,22 @@ module Text.Pandoc.Readers.Org.Parsing , getState , updateState , SourcePos + , sourceName , getPosition ) where import Data.Text (Text) import Text.Pandoc.Readers.Org.ParserState -import Text.Pandoc.Parsing hiding (F, anyLine, blanklines, newline, - parseFromString) +import Text.Pandoc.Parsing hiding (anyLine, blanklines, newline, + parseFromString) import qualified Text.Pandoc.Parsing as P import Control.Monad (guard) import Control.Monad.Reader (ReaderT) -- | The parser used to read org files. -type OrgParser m = ParserT Text OrgParserState (ReaderT OrgParserLocal m) +type OrgParser m = ParserT Sources OrgParserState (ReaderT OrgParserLocal m) -- -- Adaptions and specializations of parsing utilities diff --git a/src/Text/Pandoc/Readers/Org/Shared.hs b/src/Text/Pandoc/Readers/Org/Shared.hs index 7f72077a4..ad7c65060 100644 --- a/src/Text/Pandoc/Readers/Org/Shared.hs +++ b/src/Text/Pandoc/Readers/Org/Shared.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.Shared - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index eeb3d1389..3990f0cb5 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,8 +27,7 @@ import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Builder (Blocks, Inlines, fromList, setMeta, trimInlines) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Class.PandocMonad (PandocMonad, fetchItem, - readFileFromDirs, getCurrentTime) +import Text.Pandoc.Class.PandocMonad (PandocMonad, fetchItem, getTimestamp) import Text.Pandoc.CSV (CSVOptions (..), defaultCSVOptions, parseCSV) import Text.Pandoc.Definition import Text.Pandoc.Error @@ -38,25 +37,25 @@ import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Shared import qualified Text.Pandoc.UTF8 as UTF8 -import Text.Printf (printf) import Data.Time.Format +import System.FilePath (takeDirectory) -- TODO: -- [ ] .. parsed-literal -- | Parse reStructuredText string and return Pandoc document. -readRST :: PandocMonad m +readRST :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ Text to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readRST opts s = do parsed <- readWithM parseRST def{ stateOptions = opts } - (crFilter s <> "\n\n") + (ensureFinalNewlines 2 (toSources s)) case parsed of Right result -> return result Left e -> throwError e -type RSTParser m = ParserT Text ParserState m +type RSTParser m = ParserT Sources ParserState m -- -- Constants and data structure definitions @@ -151,11 +150,19 @@ parseRST = do startPos <- getPosition -- go through once just to get list of reference keys and notes -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- T.concat <$> - manyTill (referenceKey <|> anchorDef <|> - noteBlock <|> citationBlock <|> - (snd <$> withRaw comment) <|> - headerBlock <|> lineClump) eof + let chunk = referenceKey + <|> anchorDef + <|> noteBlock + <|> citationBlock + <|> (snd <$> withRaw comment) + <|> headerBlock + <|> lineClump + docMinusKeys <- Sources <$> + manyTill (do pos <- getPosition + t <- chunk + return (pos, t)) eof + -- UGLY: we collapse source position information. + -- TODO: fix the parser to use the F monad instead of two passes setInput docMinusKeys setPosition startPos st' <- getState @@ -348,7 +355,7 @@ singleHeader' = try $ do -- hrule block -- -hrule :: Monad m => ParserT Text st m Blocks +hrule :: Monad m => ParserT Sources st m Blocks hrule = try $ do chr <- oneOf underlineChars count 3 (char chr) @@ -363,7 +370,7 @@ hrule = try $ do -- read a line indented by a given string indentedLine :: (HasReaderOptions st, Monad m) - => Int -> ParserT Text st m Text + => Int -> ParserT Sources st m Text indentedLine indents = try $ do lookAhead spaceChar gobbleAtMostSpaces indents @@ -372,7 +379,7 @@ indentedLine indents = try $ do -- one or more indented lines, possibly separated by blank lines. -- any amount of indentation will work. indentedBlock :: (HasReaderOptions st, Monad m) - => ParserT Text st m Text + => ParserT Sources st m Text indentedBlock = try $ do indents <- length <$> lookAhead (many1 spaceChar) lns <- many1 $ try $ do b <- option "" blanklines @@ -381,20 +388,20 @@ indentedBlock = try $ do optional blanklines return $ T.unlines lns -quotedBlock :: Monad m => ParserT Text st m Text +quotedBlock :: Monad m => ParserT Sources st m Text quotedBlock = try $ do quote <- lookAhead $ oneOf "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" lns <- many1 $ lookAhead (char quote) >> anyLine optional blanklines return $ T.unlines lns -codeBlockStart :: Monad m => ParserT Text st m Char +codeBlockStart :: Monad m => ParserT Sources st m Char codeBlockStart = string "::" >> blankline >> blankline -codeBlock :: Monad m => ParserT Text ParserState m Blocks +codeBlock :: Monad m => ParserT Sources ParserState m Blocks codeBlock = try $ codeBlockStart >> codeBlockBody -codeBlockBody :: Monad m => ParserT Text ParserState m Blocks +codeBlockBody :: Monad m => ParserT Sources ParserState m Blocks codeBlockBody = do lang <- stateRstHighlight <$> getState try $ B.codeBlockWith ("", maybeToList lang, []) . stripTrailingNewlines <$> @@ -410,14 +417,14 @@ lhsCodeBlock = try $ do return $ B.codeBlockWith ("", ["haskell","literate"], []) $ T.intercalate "\n" lns -latexCodeBlock :: Monad m => ParserT Text st m [Text] +latexCodeBlock :: Monad m => ParserT Sources st m [Text] latexCodeBlock = try $ do try (latexBlockLine "\\begin{code}") many1Till anyLine (try $ latexBlockLine "\\end{code}") where latexBlockLine s = skipMany spaceChar >> string s >> blankline -birdCodeBlock :: Monad m => ParserT Text st m [Text] +birdCodeBlock :: Monad m => ParserT Sources st m [Text] birdCodeBlock = filterSpace <$> many1 birdTrackLine where filterSpace lns = -- if (as is normal) there is always a space after >, drop it @@ -425,7 +432,7 @@ birdCodeBlock = filterSpace <$> many1 birdTrackLine then map (T.drop 1) lns else lns -birdTrackLine :: Monad m => ParserT Text st m Text +birdTrackLine :: Monad m => ParserT Sources st m Text birdTrackLine = char '>' >> anyLine -- @@ -446,64 +453,43 @@ encoding -} includeDirective :: PandocMonad m - => Text -> [(Text, Text)] -> Text + => Text + -> [(Text, Text)] + -> Text -> RSTParser m Blocks includeDirective top fields body = do - let f = trim top - guard $ not (T.null f) + let f = T.unpack $ trim top + guard $ not $ null f guard $ T.null (trim body) - -- options - let (startLine :: Maybe Int) = lookup "start-line" fields >>= safeRead - let (endLine :: Maybe Int) = lookup "end-line" fields >>= safeRead - oldPos <- getPosition - oldInput <- getInput - containers <- stateContainers <$> getState - when (f `elem` containers) $ - throwError $ PandocParseError $ "Include file loop at " <> tshow oldPos - updateState $ \s -> s{ stateContainers = f : stateContainers s } - mbContents <- readFileFromDirs ["."] $ T.unpack f - contentLines <- case mbContents of - Just s -> return $ T.lines s - Nothing -> do - logMessage $ CouldNotLoadIncludeFile f oldPos - return [] - let numLines = length contentLines - let startLine' = case startLine of - Nothing -> 1 - Just x | x >= 0 -> x - | otherwise -> numLines + x -- negative from end - let endLine' = case endLine of - Nothing -> numLines + 1 - Just x | x >= 0 -> x - | otherwise -> numLines + x -- negative from end - let contentLines' = drop (startLine' - 1) - $ take (endLine' - 1) contentLines - let contentLines'' = (case trim <$> lookup "end-before" fields of - Just patt -> takeWhile (not . (patt `T.isInfixOf`)) - Nothing -> id) . - (case trim <$> lookup "start-after" fields of - Just patt -> drop 1 . - dropWhile (not . (patt `T.isInfixOf`)) - Nothing -> id) $ contentLines' - let contents' = T.unlines contentLines'' - case lookup "code" fields of - Just lang -> do - let classes = maybe [] T.words (lookup "class" fields) - let ident = maybe "" trimr $ lookup "name" fields - codeblock ident classes fields (trimr lang) contents' False - Nothing -> case lookup "literal" fields of - Just _ -> return $ B.rawBlock "rst" contents' - Nothing -> do - setPosition $ newPos (T.unpack f) 1 1 - setInput $ contents' <> "\n" - bs <- optional blanklines >> - (mconcat <$> many block) - setInput oldInput - setPosition oldPos - updateState $ \s -> s{ stateContainers = - tail $ stateContainers s } - return bs - + let startLine = lookup "start-line" fields >>= safeRead + let endLine = lookup "end-line" fields >>= safeRead + let classes = maybe [] T.words (lookup "class" fields) + let ident = maybe "" trimr $ lookup "name" fields + let parser = + case lookup "code" fields of + Just lang -> + (codeblock ident classes fields (trimr lang) False + . sourcesToText) <$> getInput + Nothing -> + case lookup "literal" fields of + Just _ -> B.rawBlock "rst" . sourcesToText <$> getInput + Nothing -> parseBlocks + let isLiteral = isJust (lookup "code" fields `mplus` lookup "literal" fields) + let selectLines = + (case trim <$> lookup "end-before" fields of + Just patt -> takeWhile (not . (patt `T.isInfixOf`)) + Nothing -> id) . + (case trim <$> lookup "start-after" fields of + Just patt -> drop 1 . + dropWhile (not . (patt `T.isInfixOf`)) + Nothing -> id) + + let toStream t = + Sources [(initialPos f, + (T.unlines . selectLines . T.lines $ t) <> + if isLiteral then mempty else "\n")] -- see #7436 + currentDir <- takeDirectory . sourceName <$> getPosition + insertIncludedFile parser toStream [currentDir] f startLine endLine -- -- list blocks @@ -526,7 +512,7 @@ definitionList :: PandocMonad m => RSTParser m Blocks definitionList = B.definitionList <$> many1 definitionListItem -- parses bullet list start and returns its length (inc. following whitespace) -bulletListStart :: Monad m => ParserT Text st m Int +bulletListStart :: Monad m => ParserT Sources st m Int bulletListStart = try $ do notFollowedBy' hrule -- because hrules start out just like lists marker <- oneOf bulletListMarkers @@ -613,8 +599,9 @@ comment = try $ do string ".." skipMany1 spaceChar <|> (() <$ lookAhead newline) -- notFollowedBy' directiveLabel -- comment comes after directive so unnec. - manyTill anyChar blanklines + _ <- anyLine optional indentedBlock + optional blanklines return mempty directiveLabel :: Monad m => RSTParser m Text @@ -685,7 +672,7 @@ directive' = do "replace" -> B.para <$> -- consumed by substKey parseInlineFromText (trim top) "date" -> B.para <$> do -- consumed by substKey - t <- getCurrentTime + t <- getTimestamp let format = case T.unpack (T.strip top) of [] -> "%Y-%m-%d" x -> x @@ -731,8 +718,8 @@ directive' = do "" -> stateRstHighlight def lang -> Just lang }) x | x == "code" || x == "code-block" || x == "sourcecode" -> - codeblock name classes (map (second trimr) fields) - (trim top) body True + return $ codeblock name classes (map (second trimr) fields) + (trim top) True body "aafig" -> do let attribs = (name, ["aafig"], map (second trimr) fields) return $ B.codeBlockWith attribs $ stripTrailingNewlines body @@ -877,10 +864,11 @@ csvTableDirective top fields rawcsv = do (bs, _) <- fetchItem u return $ UTF8.toText bs Nothing -> return rawcsv - let res = parseCSV opts (case explicitHeader of - Just h -> h <> "\n" <> rawcsv' - Nothing -> rawcsv') - case res of + let header' = case explicitHeader of + Just h -> parseCSV defaultCSVOptions h + Nothing -> Right [] + let res = parseCSV opts rawcsv' + case (<>) <$> header' <*> res of Left e -> throwError $ PandocParsecError "csv table" e Right rawrows -> do @@ -1017,10 +1005,10 @@ toChunks = dropWhile T.null then "\\begin{aligned}\n" <> s <> "\n\\end{aligned}" else s -codeblock :: Text -> [Text] -> [(Text, Text)] -> Text -> Text -> Bool - -> RSTParser m Blocks -codeblock ident classes fields lang body rmTrailingNewlines = - return $ B.codeBlockWith attribs $ stripTrailingNewlines' body +codeblock :: Text -> [Text] -> [(Text, Text)] -> Text -> Bool -> Text + -> Blocks +codeblock ident classes fields lang rmTrailingNewlines body = + B.codeBlockWith attribs $ stripTrailingNewlines' body where stripTrailingNewlines' = if rmTrailingNewlines then stripTrailingNewlines else id @@ -1101,7 +1089,7 @@ quotedReferenceName = try $ do -- plus isolated (no two adjacent) internal hyphens, underscores, -- periods, colons and plus signs; no whitespace or other characters -- are allowed. -simpleReferenceName :: Monad m => ParserT Text st m Text +simpleReferenceName :: Monad m => ParserT Sources st m Text simpleReferenceName = do x <- alphaNum xs <- many $ alphaNum @@ -1120,7 +1108,7 @@ referenceKey = do -- return enough blanks to replace key return $ T.replicate (sourceLine endPos - sourceLine startPos) "\n" -targetURI :: Monad m => ParserT Text st m Text +targetURI :: Monad m => ParserT Sources st m Text targetURI = do skipSpaces optional $ try $ newline >> notFollowedBy blankline @@ -1158,8 +1146,10 @@ anonymousKey :: Monad m => RSTParser m () anonymousKey = try $ do oneOfStrings [".. __:", "__"] src <- targetURI - pos <- getPosition - let key = toKey $ "_" <> T.pack (printf "%09d" (sourceLine pos)) + -- we need to ensure that the keys are ordered by occurrence in + -- the document. + numKeys <- M.size . stateKeys <$> getState + let key = toKey $ "_" <> T.pack (show numKeys) updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ stateKeys s } @@ -1248,13 +1238,13 @@ headerBlock = do -- Grid tables TODO: -- - column spans -dashedLine :: Monad m => Char -> ParserT Text st m (Int, Int) +dashedLine :: Monad m => Char -> ParserT Sources st m (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many (char ' ') return (length dashes, length $ dashes ++ sp) -simpleDashedLines :: Monad m => Char -> ParserT Text st m [(Int,Int)] +simpleDashedLines :: Monad m => Char -> ParserT Sources st m [(Int,Int)] simpleDashedLines ch = try $ many1 (dashedLine ch) -- Parse a table row separator @@ -1380,7 +1370,7 @@ hyphens = do -- don't want to treat endline after hyphen or dash as a space return $ B.str result -escapedChar :: Monad m => ParserT Text st m Inlines +escapedChar :: Monad m => ParserT Sources st m Inlines escapedChar = do c <- escaped anyChar return $ if c == ' ' || c == '\n' || c == '\r' -- '\ ' is null in RST @@ -1656,21 +1646,4 @@ note = try $ do return $ B.note contents smart :: PandocMonad m => RSTParser m Inlines -smart = do - guardEnabled Ext_smart - doubleQuoted <|> singleQuoted <|> - choice [apostrophe, dash, ellipses] - -singleQuoted :: PandocMonad m => RSTParser m Inlines -singleQuoted = try $ do - singleQuoteStart - withQuoteContext InSingleQuote $ - B.singleQuoted . trimInlines . mconcat <$> - many1Till inline singleQuoteEnd - -doubleQuoted :: PandocMonad m => RSTParser m Inlines -doubleQuoted = try $ do - doubleQuoteStart - withQuoteContext InDoubleQuote $ - B.doubleQuoted . trimInlines . mconcat <$> - many1Till inline doubleQuoteEnd +smart = smartPunctuation inline diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs index 509ce1377..47f16ef4b 100644 --- a/src/Text/Pandoc/Readers/Roff.hs +++ b/src/Text/Pandoc/Readers/Roff.hs @@ -42,7 +42,6 @@ import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Shared (safeRead) -import Text.Parsec hiding (tokenPrim) import Text.Pandoc.RoffChar (characterCodes, combiningAccents) import qualified Data.Sequence as Seq import qualified Data.Foldable as Foldable @@ -122,16 +121,16 @@ instance Default RoffState where , afterConditional = False } -type RoffLexer m = ParserT T.Text RoffState m +type RoffLexer m = ParserT Sources RoffState m -- -- Lexer: T.Text -> RoffToken -- -eofline :: Stream s m Char => ParsecT s u m () +eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s u m () eofline = void newline <|> eof <|> () <$ lookAhead (string "\\}") -spacetab :: Stream s m Char => ParsecT s u m Char +spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s u m Char spacetab = char ' ' <|> char '\t' characterCodeMap :: M.Map T.Text Char @@ -303,8 +302,7 @@ expandString = try $ do char '*' cs <- escapeArg <|> countChar 1 anyChar s <- linePartsToText <$> resolveText cs pos - getInput >>= setInput . (s <>) - return () + addToInput s -- Parses: '..' quoteArg :: PandocMonad m => RoffLexer m T.Text @@ -316,7 +314,7 @@ escFont = do font' <- if T.null font || font == "P" then prevFont <$> getState else return $ foldr processFontLetter defaultFontSpec $ T.unpack font - modifyState $ \st -> st{ prevFont = currentFont st + updateState $ \st -> st{ prevFont = currentFont st , currentFont = font' } return [Font font'] where @@ -372,8 +370,8 @@ lexTable pos = do spaces opts <- try tableOptions <|> [] <$ optional (char ';') case lookup "tab" opts of - Just (T.uncons -> Just (c, _)) -> modifyState $ \st -> st{ tableTabChar = c } - _ -> modifyState $ \st -> st{ tableTabChar = '\t' } + Just (T.uncons -> Just (c, _)) -> updateState $ \st -> st{ tableTabChar = c } + _ -> updateState $ \st -> st{ tableTabChar = '\t' } spaces skipMany lexComment spaces @@ -489,18 +487,18 @@ lexConditional mname = do ifPart <- do optional $ try $ char '\\' >> newline lexGroup - <|> do modifyState $ \s -> s{ afterConditional = True } + <|> do updateState $ \s -> s{ afterConditional = True } t <- manToken - modifyState $ \s -> s{ afterConditional = False } + updateState $ \s -> s{ afterConditional = False } return t case mbtest of Nothing -> do - putState st -- reset state, so we don't record macros in skipped section + setState st -- reset state, so we don't record macros in skipped section report $ SkippedContent (T.cons '.' mname) pos return mempty Just True -> return ifPart Just False -> do - putState st + setState st return mempty expression :: PandocMonad m => RoffLexer m (Maybe Bool) @@ -515,7 +513,7 @@ expression = do _ -> Nothing where returnValue v = do - modifyState $ \st -> st{ lastExpression = v } + updateState $ \st -> st{ lastExpression = v } return v lexGroup :: PandocMonad m => RoffLexer m RoffTokens @@ -536,7 +534,7 @@ lexIncludeFile args = do result <- readFileFromDirs dirs $ T.unpack fp case result of Nothing -> report $ CouldNotLoadIncludeFile fp pos - Just s -> getInput >>= setInput . (s <>) + Just s -> addToInput s return mempty [] -> return mempty @@ -564,13 +562,13 @@ lexStringDef args = do -- string definition (x:ys) -> do let ts = singleTok $ TextLine (intercalate [RoffStr " " ] ys) let stringName = linePartsToText x - modifyState $ \st -> + updateState $ \st -> st{ customMacros = M.insert stringName ts (customMacros st) } return mempty lexMacroDef :: PandocMonad m => [Arg] -> RoffLexer m RoffTokens lexMacroDef args = do -- macro definition - modifyState $ \st -> st{ roffMode = CopyMode } + updateState $ \st -> st{ roffMode = CopyMode } (macroName, stopMacro) <- case args of (x : y : _) -> return (linePartsToText x, linePartsToText y) @@ -584,7 +582,7 @@ lexMacroDef args = do -- macro definition _ <- lexArgs return () ts <- mconcat <$> manyTill manToken stop - modifyState $ \st -> + updateState $ \st -> st{ customMacros = M.insert macroName ts (customMacros st) , roffMode = NormalMode } return mempty diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs index 484a6c923..276d28aaa 100644 --- a/src/Text/Pandoc/Readers/TWiki.hs +++ b/src/Text/Pandoc/Readers/TWiki.hs @@ -28,22 +28,22 @@ import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, nested) import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag) -import Text.Pandoc.Shared (crFilter, tshow) +import Text.Pandoc.Shared (tshow) import Text.Pandoc.XML (fromEntities) -- | Read twiki from an input string and return a Pandoc document. -readTWiki :: PandocMonad m +readTWiki :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readTWiki opts s = do - res <- readWithM parseTWiki def{ stateOptions = opts } - (crFilter s <> "\n\n") + let sources = ensureFinalNewlines 2 (toSources s) + res <- readWithM parseTWiki def{ stateOptions = opts } sources case res of Left e -> throwError e Right d -> return d -type TWParser = ParserT Text ParserState +type TWParser = ParserT Sources ParserState -- -- utility functions @@ -469,27 +469,7 @@ symbol :: PandocMonad m => TWParser m B.Inlines symbol = B.str <$> countChar 1 nonspaceChar smart :: PandocMonad m => TWParser m B.Inlines -smart = do - guardEnabled Ext_smart - doubleQuoted <|> singleQuoted <|> - choice [ apostrophe - , dash - , ellipses - ] - -singleQuoted :: PandocMonad m => TWParser m B.Inlines -singleQuoted = try $ do - singleQuoteStart - withQuoteContext InSingleQuote - (B.singleQuoted . B.trimInlines . mconcat <$> many1Till inline singleQuoteEnd) - -doubleQuoted :: PandocMonad m => TWParser m B.Inlines -doubleQuoted = try $ do - doubleQuoteStart - contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) - withQuoteContext InDoubleQuote (doubleQuoteEnd >> - return (B.doubleQuoted $ B.trimInlines contents)) - <|> return (B.str "\8220" B.<> contents) +smart = smartPunctuation inline link :: PandocMonad m => TWParser m B.Inlines link = try $ do diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 6691d8381..981878206 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -3,7 +3,7 @@ {- | Module : Text.Pandoc.Readers.Textile Copyright : Copyright (C) 2010-2012 Paul Rivier - 2010-2020 John MacFarlane + 2010-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -11,7 +11,7 @@ Portability : portable Conversion from Textile to 'Pandoc' document, based on the spec -available at http://redcloth.org/textile. +available at https://www.promptworks.com/textile/. Implemented and parsed: - Paragraphs @@ -38,7 +38,8 @@ module Text.Pandoc.Readers.Textile ( readTextile) where import Control.Monad (guard, liftM) import Control.Monad.Except (throwError) import Data.Char (digitToInt, isUpper) -import Data.List (intersperse, transpose) +import Data.List (intersperse, transpose, foldl') +import Data.List.NonEmpty (NonEmpty(..), nonEmpty) import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup (Tag (..), fromAttrib) @@ -52,30 +53,34 @@ import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isInlineTag) import Text.Pandoc.Readers.LaTeX (rawLaTeXBlock, rawLaTeXInline) -import Text.Pandoc.Shared (crFilter, trim, tshow) +import Text.Pandoc.Shared (trim, tshow) -- | Parse a Textile text and return a Pandoc document. -readTextile :: PandocMonad m +readTextile :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readTextile opts s = do - parsed <- readWithM parseTextile def{ stateOptions = opts } - (crFilter s <> "\n\n") + let sources = ensureFinalNewlines 2 (toSources s) + parsed <- readWithM parseTextile def{ stateOptions = opts } sources case parsed of Right result -> return result Left e -> throwError e +type TextileParser = ParserT Sources ParserState -- | Generate a Pandoc ADT from a textile document -parseTextile :: PandocMonad m => ParserT Text ParserState m Pandoc +parseTextile :: PandocMonad m => TextileParser m Pandoc parseTextile = do many blankline startPos <- getPosition -- go through once just to get list of reference keys and notes -- docMinusKeys is the raw document with blanks where the keys/notes were... - let firstPassParser = noteBlock <|> lineClump - manyTill firstPassParser eof >>= setInput . T.concat + let firstPassParser = do + pos <- getPosition + t <- noteBlock <|> lineClump + return (pos, t) + manyTill firstPassParser eof >>= setInput . Sources setPosition startPos st' <- getState let reversedNotes = stateNotes st' @@ -83,10 +88,10 @@ parseTextile = do -- now parse it for real... Pandoc nullMeta . B.toList <$> parseBlocks -- FIXME -noteMarker :: PandocMonad m => ParserT Text ParserState m Text +noteMarker :: PandocMonad m => TextileParser m Text noteMarker = skipMany spaceChar >> string "fn" >> T.pack <$> manyTill digit (char '.') -noteBlock :: PandocMonad m => ParserT Text ParserState m Text +noteBlock :: PandocMonad m => TextileParser m Text noteBlock = try $ do startPos <- getPosition ref <- noteMarker @@ -101,11 +106,11 @@ noteBlock = try $ do return $ T.replicate (sourceLine endPos - sourceLine startPos) "\n" -- | Parse document blocks -parseBlocks :: PandocMonad m => ParserT Text ParserState m Blocks +parseBlocks :: PandocMonad m => TextileParser m Blocks parseBlocks = mconcat <$> manyTill block eof -- | Block parsers list tried in definition order -blockParsers :: PandocMonad m => [ParserT Text ParserState m Blocks] +blockParsers :: PandocMonad m => [TextileParser m Blocks] blockParsers = [ codeBlock , header , blockQuote @@ -120,22 +125,22 @@ blockParsers = [ codeBlock ] -- | Any block in the order of definition of blockParsers -block :: PandocMonad m => ParserT Text ParserState m Blocks +block :: PandocMonad m => TextileParser m Blocks block = do res <- choice blockParsers <?> "block" trace (T.take 60 $ tshow $ B.toList res) return res -commentBlock :: PandocMonad m => ParserT Text ParserState m Blocks +commentBlock :: PandocMonad m => TextileParser m Blocks commentBlock = try $ do string "###." manyTill anyLine blanklines return mempty -codeBlock :: PandocMonad m => ParserT Text ParserState m Blocks +codeBlock :: PandocMonad m => TextileParser m Blocks codeBlock = codeBlockTextile <|> codeBlockHtml -codeBlockTextile :: PandocMonad m => ParserT Text ParserState m Blocks +codeBlockTextile :: PandocMonad m => TextileParser m Blocks codeBlockTextile = try $ do string "bc." <|> string "pre." extended <- option False (True <$ char '.') @@ -155,7 +160,7 @@ trimTrailingNewlines :: Text -> Text trimTrailingNewlines = T.dropWhileEnd (=='\n') -- | Code Blocks in Textile are between <pre> and </pre> -codeBlockHtml :: PandocMonad m => ParserT Text ParserState m Blocks +codeBlockHtml :: PandocMonad m => TextileParser m Blocks codeBlockHtml = try $ do (t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True)) result' <- T.pack <$> manyTill anyChar (htmlTag (tagClose (=="pre"))) @@ -173,7 +178,7 @@ codeBlockHtml = try $ do return $ B.codeBlockWith (ident,classes,kvs) result''' -- | Header of the form "hN. content" with N in 1..6 -header :: PandocMonad m => ParserT Text ParserState m Blocks +header :: PandocMonad m => TextileParser m Blocks header = try $ do char 'h' level <- digitToInt <$> oneOf "123456" @@ -185,14 +190,14 @@ header = try $ do return $ B.headerWith attr' level name -- | Blockquote of the form "bq. content" -blockQuote :: PandocMonad m => ParserT Text ParserState m Blocks +blockQuote :: PandocMonad m => TextileParser m Blocks blockQuote = try $ do string "bq" >> attributes >> char '.' >> whitespace B.blockQuote <$> para -- Horizontal rule -hrule :: PandocMonad m => ParserT Text st m Blocks +hrule :: PandocMonad m => TextileParser m Blocks hrule = try $ do skipSpaces start <- oneOf "-*" @@ -207,39 +212,39 @@ hrule = try $ do -- | Can be a bullet list or an ordered list. This implementation is -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" -anyList :: PandocMonad m => ParserT Text ParserState m Blocks +anyList :: PandocMonad m => TextileParser m Blocks anyList = try $ anyListAtDepth 1 <* blanklines -- | This allow one type of list to be nested into an other type, -- provided correct nesting -anyListAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +anyListAtDepth :: PandocMonad m => Int -> TextileParser m Blocks anyListAtDepth depth = choice [ bulletListAtDepth depth, orderedListAtDepth depth, definitionList ] -- | Bullet List of given depth, depth being the number of leading '*' -bulletListAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +bulletListAtDepth :: PandocMonad m => Int -> TextileParser m Blocks bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' -bulletListItemAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +bulletListItemAtDepth :: PandocMonad m => Int -> TextileParser m Blocks bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of -- leading '#' -orderedListAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +orderedListAtDepth :: PandocMonad m => Int -> TextileParser m Blocks orderedListAtDepth depth = try $ do items <- many1 (orderedListItemAtDepth depth) return $ B.orderedList items -- | Ordered List Item of given depth, depth being the number of -- leading '#' -orderedListItemAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +orderedListItemAtDepth :: PandocMonad m => Int -> TextileParser m Blocks orderedListItemAtDepth = genericListItemAtDepth '#' -- | Common implementation of list items -genericListItemAtDepth :: PandocMonad m => Char -> Int -> ParserT Text ParserState m Blocks +genericListItemAtDepth :: PandocMonad m => Char -> Int -> TextileParser m Blocks genericListItemAtDepth c depth = try $ do count depth (char c) >> attributes >> whitespace contents <- mconcat <$> many ((B.plain . mconcat <$> many1 inline) <|> @@ -249,25 +254,25 @@ genericListItemAtDepth c depth = try $ do return $ contents <> sublist -- | A definition list is a set of consecutive definition items -definitionList :: PandocMonad m => ParserT Text ParserState m Blocks +definitionList :: PandocMonad m => TextileParser m Blocks definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. -listStart :: PandocMonad m => ParserT Text ParserState m () +listStart :: PandocMonad m => TextileParser m () listStart = genericListStart '*' <|> () <$ genericListStart '#' <|> () <$ definitionListStart -genericListStart :: PandocMonad m => Char -> ParserT Text st m () +genericListStart :: PandocMonad m => Char -> TextileParser m () genericListStart c = () <$ try (many1 (char c) >> whitespace) -basicDLStart :: PandocMonad m => ParserT Text ParserState m () +basicDLStart :: PandocMonad m => TextileParser m () basicDLStart = do char '-' whitespace notFollowedBy newline -definitionListStart :: PandocMonad m => ParserT Text ParserState m Inlines +definitionListStart :: PandocMonad m => TextileParser m Inlines definitionListStart = try $ do basicDLStart trimInlines . mconcat <$> @@ -280,15 +285,15 @@ definitionListStart = try $ do -- the term defined, then spaces and ":=". The definition follows, on -- the same single line, or spaned on multiple line, after a line -- break. -definitionListItem :: PandocMonad m => ParserT Text ParserState m (Inlines, [Blocks]) +definitionListItem :: PandocMonad m => TextileParser m (Inlines, [Blocks]) definitionListItem = try $ do term <- mconcat . intersperse B.linebreak <$> many1 definitionListStart def' <- string ":=" *> optional whitespace *> (multilineDef <|> inlineDef) return (term, def') - where inlineDef :: PandocMonad m => ParserT Text ParserState m [Blocks] + where inlineDef :: PandocMonad m => TextileParser m [Blocks] inlineDef = liftM (\d -> [B.plain d]) $ optional whitespace >> (trimInlines . mconcat <$> many inline) <* newline - multilineDef :: PandocMonad m => ParserT Text ParserState m [Blocks] + multilineDef :: PandocMonad m => TextileParser m [Blocks] multilineDef = try $ do optional whitespace >> newline s <- T.pack <$> many1Till anyChar (try (string "=:" >> newline)) @@ -299,7 +304,7 @@ definitionListItem = try $ do -- raw content -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock :: PandocMonad m => ParserT Text ParserState m Blocks +rawHtmlBlock :: PandocMonad m => TextileParser m Blocks rawHtmlBlock = try $ do skipMany spaceChar (_,b) <- htmlTag isBlockTag @@ -307,14 +312,14 @@ rawHtmlBlock = try $ do return $ B.rawBlock "html" b -- | Raw block of LaTeX content -rawLaTeXBlock' :: PandocMonad m => ParserT Text ParserState m Blocks +rawLaTeXBlock' :: PandocMonad m => TextileParser m Blocks rawLaTeXBlock' = do guardEnabled Ext_raw_tex B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces) -- | In textile, paragraphs are separated by blank lines. -para :: PandocMonad m => ParserT Text ParserState m Blocks +para :: PandocMonad m => TextileParser m Blocks para = B.para . trimInlines . mconcat <$> many1 inline -- Tables @@ -325,7 +330,7 @@ toAlignment '>' = AlignRight toAlignment '=' = AlignCenter toAlignment _ = AlignDefault -cellAttributes :: PandocMonad m => ParserT Text ParserState m (Bool, Alignment) +cellAttributes :: PandocMonad m => TextileParser m (Bool, Alignment) cellAttributes = try $ do isHeader <- option False (True <$ char '_') -- we just ignore colspan and rowspan markers: @@ -338,7 +343,7 @@ cellAttributes = try $ do return (isHeader, alignment) -- | A table cell spans until a pipe | -tableCell :: PandocMonad m => ParserT Text ParserState m ((Bool, Alignment), Blocks) +tableCell :: PandocMonad m => TextileParser m ((Bool, Alignment), Blocks) tableCell = try $ do char '|' (isHeader, alignment) <- option (False, AlignDefault) cellAttributes @@ -349,7 +354,7 @@ tableCell = try $ do return ((isHeader, alignment), B.plain content) -- | A table row is made of many table cells -tableRow :: PandocMonad m => ParserT Text ParserState m [((Bool, Alignment), Blocks)] +tableRow :: PandocMonad m => TextileParser m [((Bool, Alignment), Blocks)] tableRow = try $ do -- skip optional row attributes optional $ try $ do @@ -359,7 +364,7 @@ tableRow = try $ do many1 tableCell <* char '|' <* blankline -- | A table with an optional header. -table :: PandocMonad m => ParserT Text ParserState m Blocks +table :: PandocMonad m => TextileParser m Blocks table = try $ do -- ignore table attributes caption <- option mempty $ try $ do @@ -375,8 +380,9 @@ table = try $ do (toprow:rest) | any (fst . fst) toprow -> (toprow, rest) _ -> (mempty, rawrows) - let nbOfCols = maximum $ map length (headers:rows) - let aligns = map minimum $ transpose $ map (map (snd . fst)) (headers:rows) + let nbOfCols = maximum $ fmap length (headers :| rows) + let aligns = map (maybe AlignDefault minimum . nonEmpty) $ + transpose $ map (map (snd . fst)) (headers:rows) let toRow = Row nullAttr . map B.simpleCell toHeaderRow l = [toRow l | not (null l)] return $ B.table (B.simpleCaption $ B.plain caption) @@ -386,7 +392,7 @@ table = try $ do (TableFoot nullAttr []) -- | Ignore markers for cols, thead, tfoot. -ignorableRow :: PandocMonad m => ParserT Text ParserState m () +ignorableRow :: PandocMonad m => TextileParser m () ignorableRow = try $ do char '|' oneOf ":^-~" @@ -395,7 +401,7 @@ ignorableRow = try $ do _ <- anyLine return () -explicitBlockStart :: PandocMonad m => Text -> ParserT Text ParserState m () +explicitBlockStart :: PandocMonad m => Text -> TextileParser m () explicitBlockStart name = try $ do string (T.unpack name) attributes @@ -407,8 +413,8 @@ explicitBlockStart name = try $ do -- However, they can be used to set HTML/CSS attributes when needed. maybeExplicitBlock :: PandocMonad m => Text -- ^ block tag name - -> ParserT Text ParserState m Blocks -- ^ implicit block - -> ParserT Text ParserState m Blocks + -> TextileParser m Blocks -- ^ implicit block + -> TextileParser m Blocks maybeExplicitBlock name blk = try $ do optional $ explicitBlockStart name blk @@ -421,11 +427,11 @@ maybeExplicitBlock name blk = try $ do -- | Any inline element -inline :: PandocMonad m => ParserT Text ParserState m Inlines +inline :: PandocMonad m => TextileParser m Inlines inline = choice inlineParsers <?> "inline" -- | Inline parsers tried in order -inlineParsers :: PandocMonad m => [ParserT Text ParserState m Inlines] +inlineParsers :: PandocMonad m => [TextileParser m Inlines] inlineParsers = [ str , whitespace , endline @@ -445,7 +451,7 @@ inlineParsers = [ str ] -- | Inline markups -inlineMarkup :: PandocMonad m => ParserT Text ParserState m Inlines +inlineMarkup :: PandocMonad m => TextileParser m Inlines inlineMarkup = choice [ simpleInline (string "??") (B.cite []) , simpleInline (string "**") B.strong , simpleInline (string "__") B.emph @@ -459,29 +465,29 @@ inlineMarkup = choice [ simpleInline (string "??") (B.cite []) ] -- | Trademark, registered, copyright -mark :: PandocMonad m => ParserT Text st m Inlines +mark :: PandocMonad m => TextileParser m Inlines mark = try $ char '(' >> (try tm <|> try reg <|> copy) -reg :: PandocMonad m => ParserT Text st m Inlines +reg :: PandocMonad m => TextileParser m Inlines reg = do oneOf "Rr" char ')' return $ B.str "\174" -tm :: PandocMonad m => ParserT Text st m Inlines +tm :: PandocMonad m => TextileParser m Inlines tm = do oneOf "Tt" oneOf "Mm" char ')' return $ B.str "\8482" -copy :: PandocMonad m => ParserT Text st m Inlines +copy :: PandocMonad m => TextileParser m Inlines copy = do oneOf "Cc" char ')' return $ B.str "\169" -note :: PandocMonad m => ParserT Text ParserState m Inlines +note :: PandocMonad m => TextileParser m Inlines note = try $ do ref <- char '[' *> many1 digit <* char ']' notes <- stateNotes <$> getState @@ -505,13 +511,13 @@ wordBoundaries :: [Char] wordBoundaries = markupChars <> stringBreakers -- | Parse a hyphened sequence of words -hyphenedWords :: PandocMonad m => ParserT Text ParserState m Text +hyphenedWords :: PandocMonad m => TextileParser m Text hyphenedWords = do x <- wordChunk xs <- many (try $ char '-' >> wordChunk) return $ T.intercalate "-" (x:xs) -wordChunk :: PandocMonad m => ParserT Text ParserState m Text +wordChunk :: PandocMonad m => TextileParser m Text wordChunk = try $ do hd <- noneOf wordBoundaries tl <- many ( noneOf wordBoundaries <|> @@ -520,7 +526,7 @@ wordChunk = try $ do return $ T.pack $ hd:tl -- | Any string -str :: PandocMonad m => ParserT Text ParserState m Inlines +str :: PandocMonad m => TextileParser m Inlines str = do baseStr <- hyphenedWords -- RedCloth compliance : if parsed word is uppercase and immediately @@ -533,11 +539,11 @@ str = do return $ B.str fullStr -- | Some number of space chars -whitespace :: PandocMonad m => ParserT Text st m Inlines +whitespace :: PandocMonad m => TextileParser m Inlines whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break -endline :: PandocMonad m => ParserT Text ParserState m Inlines +endline :: PandocMonad m => TextileParser m Inlines endline = try $ do newline notFollowedBy blankline @@ -545,18 +551,18 @@ endline = try $ do notFollowedBy rawHtmlBlock return B.linebreak -rawHtmlInline :: PandocMonad m => ParserT Text ParserState m Inlines +rawHtmlInline :: PandocMonad m => TextileParser m Inlines rawHtmlInline = B.rawInline "html" . snd <$> htmlTag isInlineTag -- | Raw LaTeX Inline -rawLaTeXInline' :: PandocMonad m => ParserT Text ParserState m Inlines +rawLaTeXInline' :: PandocMonad m => TextileParser m Inlines rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex B.rawInline "latex" <$> rawLaTeXInline -- | Textile standard link syntax is "label":target. But we -- can also have ["label":target]. -link :: PandocMonad m => ParserT Text ParserState m Inlines +link :: PandocMonad m => TextileParser m Inlines link = try $ do bracketed <- (True <$ char '[') <|> return False char '"' *> notFollowedBy (oneOf " \t\n\r") @@ -576,7 +582,7 @@ link = try $ do else B.spanWith attr $ B.link url "" name' -- | image embedding -image :: PandocMonad m => ParserT Text ParserState m Inlines +image :: PandocMonad m => TextileParser m Inlines image = try $ do char '!' >> notFollowedBy space (ident, cls, kvs) <- attributes @@ -588,51 +594,51 @@ image = try $ do char '!' return $ B.imageWith attr src alt (B.str alt) -escapedInline :: PandocMonad m => ParserT Text ParserState m Inlines +escapedInline :: PandocMonad m => TextileParser m Inlines escapedInline = escapedEqs <|> escapedTag -escapedEqs :: PandocMonad m => ParserT Text ParserState m Inlines +escapedEqs :: PandocMonad m => TextileParser m Inlines escapedEqs = B.str . T.pack <$> try (string "==" *> manyTill anyChar' (try $ string "==")) -- | literal text escaped btw <notextile> tags -escapedTag :: PandocMonad m => ParserT Text ParserState m Inlines +escapedTag :: PandocMonad m => TextileParser m Inlines escapedTag = B.str . T.pack <$> try (string "<notextile>" *> manyTill anyChar' (try $ string "</notextile>")) -- | Any special symbol defined in wordBoundaries -symbol :: PandocMonad m => ParserT Text ParserState m Inlines +symbol :: PandocMonad m => TextileParser m Inlines symbol = B.str . T.singleton <$> (notFollowedBy newline *> notFollowedBy rawHtmlBlock *> oneOf wordBoundaries) -- | Inline code -code :: PandocMonad m => ParserT Text ParserState m Inlines +code :: PandocMonad m => TextileParser m Inlines code = code1 <|> code2 -- any character except a newline before a blank line -anyChar' :: PandocMonad m => ParserT Text ParserState m Char +anyChar' :: PandocMonad m => TextileParser m Char anyChar' = satisfy (/='\n') <|> try (char '\n' <* notFollowedBy blankline) -code1 :: PandocMonad m => ParserT Text ParserState m Inlines +code1 :: PandocMonad m => TextileParser m Inlines code1 = B.code . T.pack <$> surrounded (char '@') anyChar' -code2 :: PandocMonad m => ParserT Text ParserState m Inlines +code2 :: PandocMonad m => TextileParser m Inlines code2 = do htmlTag (tagOpen (=="tt") null) B.code . T.pack <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes -attributes :: PandocMonad m => ParserT Text ParserState m Attr -attributes = foldl (flip ($)) ("",[],[]) <$> +attributes :: PandocMonad m => TextileParser m Attr +attributes = foldl' (flip ($)) ("",[],[]) <$> try (do special <- option id specialAttribute attrs <- many attribute return (special : attrs)) -specialAttribute :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +specialAttribute :: PandocMonad m => TextileParser m (Attr -> Attr) specialAttribute = do alignStr <- ("center" <$ char '=') <|> ("justify" <$ try (string "<>")) <|> @@ -641,11 +647,11 @@ specialAttribute = do notFollowedBy spaceChar return $ addStyle $ T.pack $ "text-align:" ++ alignStr -attribute :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +attribute :: PandocMonad m => TextileParser m (Attr -> Attr) attribute = try $ (classIdAttr <|> styleAttr <|> langAttr) <* notFollowedBy spaceChar -classIdAttr :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +classIdAttr :: PandocMonad m => TextileParser m (Attr -> Attr) classIdAttr = try $ do -- (class class #id) char '(' ws <- T.words `fmap` T.pack <$> manyTill anyChar' (char ')') @@ -657,7 +663,7 @@ classIdAttr = try $ do -- (class class #id) classes' -> return $ \(_,_,keyvals) -> ("",classes',keyvals) -styleAttr :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +styleAttr :: PandocMonad m => TextileParser m (Attr -> Attr) styleAttr = do style <- try $ enclosed (char '{') (char '}') anyChar' return $ addStyle $ T.pack style @@ -668,23 +674,23 @@ addStyle style (id',classes,keyvals) = where keyvals' = ("style", style') : [(k,v) | (k,v) <- keyvals, k /= "style"] style' = style <> ";" <> T.concat [v | ("style",v) <- keyvals] -langAttr :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +langAttr :: PandocMonad m => TextileParser m (Attr -> Attr) langAttr = do lang <- try $ enclosed (char '[') (char ']') alphaNum return $ \(id',classes,keyvals) -> (id',classes,("lang",T.pack lang):keyvals) -- | Parses material surrounded by a parser. surrounded :: (PandocMonad m, Show t) - => ParserT Text st m t -- ^ surrounding parser - -> ParserT Text st m a -- ^ content parser (to be used repeatedly) - -> ParserT Text st m [a] + => ParserT Sources st m t -- ^ surrounding parser + -> ParserT Sources st m a -- ^ content parser (to be used repeatedly) + -> ParserT Sources st m [a] surrounded border = enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) simpleInline :: PandocMonad m - => ParserT Text ParserState m t -- ^ surrounding parser + => TextileParser m t -- ^ surrounding parser -> (Inlines -> Inlines) -- ^ Inline constructor - -> ParserT Text ParserState m Inlines -- ^ content parser (to be used repeatedly) + -> TextileParser m Inlines -- ^ content parser (to be used repeatedly) simpleInline border construct = try $ do notAfterString border *> notFollowedBy (oneOf " \t\n\r") @@ -698,7 +704,7 @@ simpleInline border construct = try $ do then body else B.spanWith attr body -groupedInlineMarkup :: PandocMonad m => ParserT Text ParserState m Inlines +groupedInlineMarkup :: PandocMonad m => TextileParser m Inlines groupedInlineMarkup = try $ do char '[' sp1 <- option mempty $ B.space <$ whitespace @@ -707,5 +713,5 @@ groupedInlineMarkup = try $ do char ']' return $ sp1 <> result <> sp2 -eof' :: Monad m => ParserT Text s m Char +eof' :: Monad m => ParserT Sources s m Char eof' = '\n' <$ eof diff --git a/src/Text/Pandoc/Readers/TikiWiki.hs b/src/Text/Pandoc/Readers/TikiWiki.hs index fb4b662c5..5c414fdec 100644 --- a/src/Text/Pandoc/Readers/TikiWiki.hs +++ b/src/Text/Pandoc/Readers/TikiWiki.hs @@ -30,23 +30,23 @@ import Text.Pandoc.Definition import Text.Pandoc.Logging (Verbosity (..)) import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, nested) -import Text.Pandoc.Shared (crFilter, safeRead) +import Text.Pandoc.Shared (safeRead) import Text.Pandoc.XML (fromEntities) import Text.Printf (printf) -- | Read TikiWiki from an input string and return a Pandoc document. -readTikiWiki :: PandocMonad m +readTikiWiki :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readTikiWiki opts s = do - res <- readWithM parseTikiWiki def{ stateOptions = opts } - (crFilter s <> "\n\n") + let sources = ensureFinalNewlines 2 (toSources s) + res <- readWithM parseTikiWiki def{ stateOptions = opts } sources case res of Left e -> throwError e Right d -> return d -type TikiWikiParser = ParserT Text ParserState +type TikiWikiParser = ParserT Sources ParserState -- -- utility functions diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs index 474e4fac0..b5cf5a0f3 100644 --- a/src/Text/Pandoc/Readers/Txt2Tags.hs +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE LambdaCase #-} {- | Module : Text.Pandoc.Readers.Txt2Tags Copyright : Copyright (C) 2014 Matthew Pickering @@ -19,6 +20,7 @@ import Control.Monad.Except (catchError, throwError) import Control.Monad.Reader (Reader, asks, runReader) import Data.Default import Data.List (intercalate, transpose) +import Data.List.NonEmpty (nonEmpty) import Data.Maybe (fromMaybe) import Data.Text (Text) import qualified Data.Text as T @@ -31,9 +33,9 @@ import Data.Time (defaultTimeLocale) import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (space, spaces, uri) -import Text.Pandoc.Shared (compactify, compactifyDL, crFilter, escapeURI) +import Text.Pandoc.Shared (compactify, compactifyDL, escapeURI) -type T2T = ParserT Text ParserState (Reader T2TMeta) +type T2T = ParserT Sources ParserState (Reader T2TMeta) -- | An object for the T2T macros meta information -- the contents of each field is simply substituted verbatim into the file @@ -53,25 +55,28 @@ getT2TMeta = do inps <- P.getInputFiles outp <- fromMaybe "" <$> P.getOutputFile curDate <- formatTime defaultTimeLocale "%F" <$> P.getZonedTime - let getModTime = fmap (formatTime defaultTimeLocale "%T") . - P.getModificationTime - curMtime <- case inps of - [] -> formatTime defaultTimeLocale "%T" <$> P.getZonedTime - _ -> catchError - (maximum <$> mapM getModTime inps) - (const (return "")) - return $ T2TMeta (T.pack curDate) (T.pack curMtime) (intercalate ", " inps) outp + curMtime <- catchError + (mapM P.getModificationTime inps >>= + (\case + Nothing -> + formatTime defaultTimeLocale "%T" <$> P.getZonedTime + Just ts -> return $ + formatTime defaultTimeLocale "%T" $ maximum ts) + . nonEmpty) + (const (return "")) + return $ T2TMeta (T.pack curDate) (T.pack curMtime) + (intercalate ", " inps) outp -- | Read Txt2Tags from an input string returning a Pandoc document -readTxt2Tags :: PandocMonad m +readTxt2Tags :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readTxt2Tags opts s = do + let sources = ensureFinalNewlines 2 (toSources s) meta <- getT2TMeta let parsed = flip runReader meta $ - readWithM parseT2T (def {stateOptions = opts}) $ - crFilter s <> "\n\n" + readWithM parseT2T (def {stateOptions = opts}) sources case parsed of Right result -> return result Left e -> throwError e @@ -261,9 +266,9 @@ table = try $ do rows <- many1 (many commentLine *> tableRow) let columns = transpose rows let ncolumns = length columns - let aligns = map (foldr1 findAlign . map fst) columns + let aligns = map (fromMaybe AlignDefault . foldr findAlign Nothing) columns let rows' = map (map snd) rows - let size = maximum (map length rows') + let size = maybe 0 maximum $ nonEmpty $ map length rows' let rowsPadded = map (pad size) rows' let headerPadded = if null tableHeader then mempty else pad size tableHeader let toRow = Row nullAttr . map B.simpleCell @@ -278,10 +283,11 @@ pad :: (Monoid a) => Int -> [a] -> [a] pad n xs = xs ++ replicate (n - length xs) mempty -findAlign :: Alignment -> Alignment -> Alignment -findAlign x y - | x == y = x - | otherwise = AlignDefault +findAlign :: (Alignment, a) -> Maybe Alignment -> Maybe Alignment +findAlign (x,_) (Just y) + | x == y = Just x + | otherwise = Just AlignDefault +findAlign (x,_) Nothing = Just x headerRow :: T2T [(Alignment, Blocks)] headerRow = genericRow (string "||") @@ -472,9 +478,29 @@ macro = try $ do -- raw URLs in text are automatically linked url :: T2T Inlines url = try $ do - (rawUrl, escapedUrl) <- try uri <|> emailAddress + (rawUrl, escapedUrl) <- try uri <|> emailAddress' return $ B.link rawUrl "" (B.str escapedUrl) +emailAddress' :: T2T (Text, Text) +emailAddress' = do + (base, mailURI) <- emailAddress + query <- option "" emailQuery + return (base <> query, mailURI <> query) + +emailQuery :: T2T Text +emailQuery = do + char '?' + parts <- kv `sepBy1` (char '&') + return $ "?" <> T.intercalate "&" parts + +kv :: T2T Text +kv = do + k <- T.pack <$> many1 alphaNum + char '=' + let vchar = alphaNum <|> try (oneOf "%._/~:,=$@&+-;*" <* lookAhead alphaNum) + v <- T.pack <$> many1 vchar + return (k <> "=" <> v) + uri :: T2T (Text, Text) uri = try $ do address <- t2tURI @@ -564,7 +590,7 @@ getTarget = do _ -> "html" atStart :: T2T () -atStart = (sourceColumn <$> getPosition) >>= guard . (== 1) +atStart = getPosition >>= guard . (== 1) . sourceColumn ignoreSpacesCap :: T2T Text -> T2T Text ignoreSpacesCap p = T.toLower <$> (spaces *> p <* spaces) diff --git a/src/Text/Pandoc/Readers/Vimwiki.hs b/src/Text/Pandoc/Readers/Vimwiki.hs index 74dac5ea7..460f304c4 100644 --- a/src/Text/Pandoc/Readers/Vimwiki.hs +++ b/src/Text/Pandoc/Readers/Vimwiki.hs @@ -74,23 +74,28 @@ import Text.Pandoc.Parsing (ParserState, ParserT, blanklines, emailAddress, many1Till, orderedListMarker, readWithM, registerHeader, spaceChar, stateMeta, stateOptions, uri, manyTillChar, manyChar, textStr, - many1Char, countChar, many1TillChar) -import Text.Pandoc.Shared (crFilter, splitTextBy, stringify, stripFirstAndLast, + many1Char, countChar, many1TillChar, + alphaNum, anyChar, char, newline, noneOf, oneOf, + space, spaces, string) +import Text.Pandoc.Sources (ToSources(..), Sources) +import Text.Pandoc.Shared (splitTextBy, stringify, stripFirstAndLast, isURI, tshow) -import Text.Parsec.Char (alphaNum, anyChar, char, newline, noneOf, oneOf, space, - spaces, string) import Text.Parsec.Combinator (between, choice, eof, lookAhead, many1, manyTill, notFollowedBy, option, skipMany1) import Text.Parsec.Prim (getState, many, try, updateState, (<|>)) -readVimwiki :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readVimwiki :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readVimwiki opts s = do - res <- readWithM parseVimwiki def{ stateOptions = opts } $ crFilter s + let sources = toSources s + res <- readWithM parseVimwiki def{ stateOptions = opts } sources case res of Left e -> throwError e Right result -> return result -type VwParser = ParserT Text ParserState +type VwParser = ParserT Sources ParserState -- constants |