diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 33 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 170 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 11 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 49 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 20 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/TWiki.hs | 526 |
8 files changed, 725 insertions, 90 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 4b5fbfdfc..64eb0322f 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -84,8 +84,7 @@ import Text.Pandoc.Readers.Docx.Lists import Text.Pandoc.Readers.Docx.Reducible import Text.Pandoc.Shared import Text.Pandoc.MediaBag (insertMedia, MediaBag) -import Data.Maybe (isJust) -import Data.List (delete, stripPrefix, (\\), intersect, isPrefixOf) +import Data.List (delete, (\\), intersect) import Data.Monoid import Text.TeXMath (writeTeX) import Data.Default (Default) @@ -197,19 +196,9 @@ fixAuthors mv = mv codeStyles :: [String] codeStyles = ["VerbatimChar"] -blockQuoteDivs :: [String] -blockQuoteDivs = ["Quote", "BlockQuote", "BlockQuotation"] - codeDivs :: [String] codeDivs = ["SourceCode"] - --- For the moment, we have English, Danish, German, and French. This --- is fairly ad-hoc, and there might be a more systematic way to do --- it, but it's better than nothing. -headerPrefixes :: [String] -headerPrefixes = ["Heading", "Overskrift", "berschrift", "Titre"] - runElemToInlines :: RunElem -> Inlines runElemToInlines (TextRun s) = text s runElemToInlines (LnBrk) = linebreak @@ -434,9 +423,9 @@ parStyleToTransform pPr let pPr' = pPr { pStyle = cs, indentation = Nothing} in (divWith ("", [c], [])) . (parStyleToTransform pPr') - | (c:cs) <- pStyle pPr - , c `elem` blockQuoteDivs = - let pPr' = pPr { pStyle = cs \\ blockQuoteDivs } + | (_:cs) <- pStyle pPr + , Just True <- pBlockQuote pPr = + let pPr' = pPr { pStyle = cs } in blockQuote . (parStyleToTransform pPr') | (_:cs) <- pStyle pPr = @@ -467,12 +456,11 @@ bodyPartToBlocks (Paragraph pPr parparts) $ parStyleToTransform pPr $ codeBlock $ concatMap parPartToString parparts - | (c : cs) <- filter (isJust . isHeaderClass) $ pStyle pPr - , Just (prefix, n) <- isHeaderClass c = do + | Just (style, n) <- pHeading pPr = do ils <- local (\s-> s{docxInHeaderBlock=True}) $ (concatReduce <$> mapM parPartToInlines parparts) makeHeaderAnchor $ - headerWith ("", delete (prefix ++ show n) cs, []) n ils + headerWith ("", delete style (pStyle pPr), []) n ils | otherwise = do ils <- concatReduce <$> mapM parPartToInlines parparts >>= (return . fromList . trimLineBreaks . normalizeSpaces . toList) @@ -559,12 +547,3 @@ docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag) docxToOutput opts (Docx (Document _ body)) = let dEnv = def { docxOptions = opts} in evalDocxContext (bodyToOutput body) dEnv def - -isHeaderClass :: String -> Maybe (String, Int) -isHeaderClass s | (pref:_) <- filter (\h -> isPrefixOf h s) headerPrefixes - , Just s' <- stripPrefix pref s = - case reads s' :: [(Int, String)] of - [] -> Nothing - ((n, "") : []) -> Just (pref, n) - _ -> Nothing -isHeaderClass _ = Nothing diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 2945a1eda..5fd6b7a81 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE PatternGuards, ViewPatterns #-} +{-# LANGUAGE PatternGuards, ViewPatterns, FlexibleInstances #-} {- Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> @@ -65,7 +65,7 @@ import Text.Pandoc.Compat.Except import Text.TeXMath.Readers.OMML (readOMML) import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) import Text.TeXMath (Exp) -import Data.Char (readLitChar, ord, chr) +import Data.Char (readLitChar, ord, chr, isDigit) data ReaderEnv = ReaderEnv { envNotes :: Notes , envNumbering :: Numbering @@ -73,6 +73,7 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes , envMedia :: Media , envFont :: Maybe Font , envCharStyles :: CharStyleMap + , envParStyles :: ParStyleMap } deriving Show @@ -122,8 +123,12 @@ type Media = [(FilePath, B.ByteString)] type CharStyle = (String, RunStyle) +type ParStyle = (String, ParStyleData) + type CharStyleMap = M.Map String RunStyle +type ParStyleMap = M.Map String ParStyleData + data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] deriving Show @@ -152,6 +157,8 @@ data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer data ParagraphStyle = ParagraphStyle { pStyle :: [String] , indentation :: Maybe ParIndentation , dropCap :: Bool + , pHeading :: Maybe (String, Int) + , pBlockQuote :: Maybe Bool } deriving Show @@ -159,6 +166,8 @@ defaultParagraphStyle :: ParagraphStyle defaultParagraphStyle = ParagraphStyle { pStyle = [] , indentation = Nothing , dropCap = False + , pHeading = Nothing + , pBlockQuote = Nothing } @@ -213,6 +222,11 @@ data RunStyle = RunStyle { isBold :: Maybe Bool , rStyle :: Maybe CharStyle} deriving Show +data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) + , isBlockQuote :: Maybe Bool + , psStyle :: Maybe ParStyle} + deriving Show + defaultRunStyle :: RunStyle defaultRunStyle = RunStyle { isBold = Nothing , isItalic = Nothing @@ -242,8 +256,8 @@ archiveToDocx archive = do numbering = archiveToNumbering archive rels = archiveToRelationships archive media = archiveToMedia archive - styles = archiveToStyles archive - rEnv = ReaderEnv notes numbering rels media Nothing styles + (styles, parstyles) = archiveToStyles archive + rEnv = ReaderEnv notes numbering rels media Nothing styles parstyles doc <- runD (archiveToDocument archive) rEnv return $ Docx doc @@ -263,47 +277,69 @@ elemToBody ns element | isElem ns "w" "body" element = (\bps -> return $ Body bps) elemToBody _ _ = throwError WrongElem -archiveToStyles :: Archive -> CharStyleMap +archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) archiveToStyles zf = let stylesElem = findEntryByPath "word/styles.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) in case stylesElem of - Nothing -> M.empty + Nothing -> (M.empty, M.empty) Just styElem -> let namespaces = mapMaybe attrToNSPair (elAttribs styElem) in - M.fromList $ buildBasedOnList namespaces styElem Nothing + ( M.fromList $ buildBasedOnList namespaces styElem + (Nothing :: Maybe CharStyle), + M.fromList $ buildBasedOnList namespaces styElem + (Nothing :: Maybe ParStyle) ) -isBasedOnStyle :: NameSpaces -> Element -> Maybe CharStyle -> Bool +isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle | isElem ns "w" "style" element - , Just "character" <- findAttr (elemName ns "w" "type") element + , Just styleType <- findAttr (elemName ns "w" "type") element + , styleType == cStyleType parentStyle , Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>= findAttr (elemName ns "w" "val") - , Just (parentId, _) <- parentStyle = (basedOnVal == parentId) + , Just ps <- parentStyle = (basedOnVal == getStyleId ps) | isElem ns "w" "style" element - , Just "character" <- findAttr (elemName ns "w" "type") element + , Just styleType <- findAttr (elemName ns "w" "type") element + , styleType == cStyleType parentStyle , Nothing <- findChild (elemName ns "w" "basedOn") element , Nothing <- parentStyle = True | otherwise = False -elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle -elemToCharStyle ns element parentStyle - | isElem ns "w" "style" element - , Just "character" <- findAttr (elemName ns "w" "type") element - , Just styleId <- findAttr (elemName ns "w" "styleId") element = - Just (styleId, elemToRunStyle ns element parentStyle) - | otherwise = Nothing - -getStyleChildren :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle] +class ElemToStyle a where + cStyleType :: Maybe a -> String + elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a + getStyleId :: a -> String + +instance ElemToStyle CharStyle where + cStyleType _ = "character" + elemToStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "character" <- findAttr (elemName ns "w" "type") element + , Just styleId <- findAttr (elemName ns "w" "styleId") element = + Just (styleId, elemToRunStyle ns element parentStyle) + | otherwise = Nothing + getStyleId s = fst s + +instance ElemToStyle ParStyle where + cStyleType _ = "paragraph" + elemToStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "paragraph" <- findAttr (elemName ns "w" "type") element + , Just styleId <- findAttr (elemName ns "w" "styleId") element = + Just (styleId, elemToParStyleData ns element parentStyle) + | otherwise = Nothing + getStyleId s = fst s + +getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] getStyleChildren ns element parentStyle | isElem ns "w" "styles" element = - mapMaybe (\e -> elemToCharStyle ns e parentStyle) $ + mapMaybe (\e -> elemToStyle ns e parentStyle) $ filterChildren (\e' -> isBasedOnStyle ns e' parentStyle) element | otherwise = [] -buildBasedOnList :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle] +buildBasedOnList :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] buildBasedOnList ns element rootStyle = case (getStyleChildren ns element rootStyle) of [] -> [] @@ -543,7 +579,8 @@ elemToBodyPart ns element elemToBodyPart ns element | isElem ns "w" "p" element , Just (numId, lvl) <- elemToNumInfo ns element = do - let parstyle = elemToParagraphStyle ns element + sty <- asks envParStyles + let parstyle = elemToParagraphStyle ns element sty parparts <- mapD (elemToParPart ns) (elChildren element) num <- asks envNumbering case lookupLevel numId lvl num of @@ -551,7 +588,8 @@ elemToBodyPart ns element Nothing -> throwError WrongElem elemToBodyPart ns element | isElem ns "w" "p" element = do - let parstyle = elemToParagraphStyle ns element + sty <- asks envParStyles + let parstyle = elemToParagraphStyle ns element sty parparts <- mapD (elemToParPart ns) (elChildren element) return $ Paragraph parstyle parparts elemToBodyPart ns element @@ -625,17 +663,20 @@ elemToParPart ns element return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttr (elemName ns "w" "anchor") element = do + , Just relId <- findAttr (elemName ns "r" "id") element = do runs <- mapD (elemToRun ns) (elChildren element) - return $ InternalHyperLink anchor runs + rels <- asks envRelationships + case lookupRelationship relId rels of + Just target -> do + case findAttr (elemName ns "w" "anchor") element of + Just anchor -> return $ ExternalHyperLink (target ++ '#':anchor) runs + Nothing -> return $ ExternalHyperLink target runs + Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttr (elemName ns "r" "id") element = do + , Just anchor <- findAttr (elemName ns "w" "anchor") element = do runs <- mapD (elemToRun ns) (elChildren element) - rels <- asks envRelationships - return $ case lookupRelationship relId rels of - Just target -> ExternalHyperLink target runs - Nothing -> ExternalHyperLink "" runs + return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "m" "oMath" element = (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath) @@ -684,14 +725,30 @@ elemToRun ns element return $ Run runStyle runElems elemToRun _ _ = throwError WrongElem -elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle -elemToParagraphStyle ns element +getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a +getParentStyleValue field style + | Just value <- field style = Just value + | Just parentStyle <- psStyle style + = getParentStyleValue field (snd parentStyle) +getParentStyleValue _ _ = Nothing + +getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] -> + Maybe a +getParStyleField field stylemap styles + | x <- mapMaybe (\x -> M.lookup x stylemap) styles + , (y:_) <- mapMaybe (getParentStyleValue field) x + = Just y +getParStyleField _ _ _ = Nothing + +elemToParagraphStyle :: NameSpaces -> Element -> ParStyleMap -> ParagraphStyle +elemToParagraphStyle ns element sty | Just pPr <- findChild (elemName ns "w" "pPr") element = - ParagraphStyle - {pStyle = + let style = mapMaybe (findAttr (elemName ns "w" "val")) (findChildren (elemName ns "w" "pStyle") pPr) + in ParagraphStyle + {pStyle = style , indentation = findChild (elemName ns "w" "ind") pPr >>= elemToParIndentation ns @@ -703,8 +760,10 @@ elemToParagraphStyle ns element Just "none" -> False Just _ -> True Nothing -> False + , pHeading = getParStyleField headingLev sty style + , pBlockQuote = getParStyleField isBlockQuote sty style } -elemToParagraphStyle _ _ = defaultParagraphStyle +elemToParagraphStyle _ _ _ = defaultParagraphStyle checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag @@ -758,6 +817,45 @@ elemToRunStyle ns element parentStyle } elemToRunStyle _ _ _ = defaultRunStyle +isNumericNotNull :: String -> Bool +isNumericNotNull str = (str /= []) && (all isDigit str) + +getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) +getHeaderLevel ns element + | Just styleId <- findAttr (elemName ns "w" "styleId") element + , Just index <- stripPrefix "Heading" styleId + , isNumericNotNull index = Just (styleId, read index) + | Just styleId <- findAttr (elemName ns "w" "styleId") element + , Just index <- findChild (elemName ns "w" "name") element >>= + findAttr (elemName ns "w" "val") >>= + stripPrefix "heading " + , isNumericNotNull index = Just (styleId, read index) +getHeaderLevel _ _ = Nothing + +blockQuoteStyleIds :: [String] +blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"] + +blockQuoteStyleNames :: [String] +blockQuoteStyleNames = ["Quote", "Block Text"] + +getBlockQuote :: NameSpaces -> Element -> Maybe Bool +getBlockQuote ns element + | Just styleId <- findAttr (elemName ns "w" "styleId") element + , styleId `elem` blockQuoteStyleIds = Just True + | Just styleName <- findChild (elemName ns "w" "name") element >>= + findAttr (elemName ns "w" "val") + , styleName `elem` blockQuoteStyleNames = Just True +getBlockQuote _ _ = Nothing + +elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData +elemToParStyleData ns element parentStyle = + ParStyleData + { + headingLev = getHeaderLevel ns element + , isBlockQuote = getBlockQuote ns element + , psStyle = parentStyle + } + elemToRunElem :: NameSpaces -> Element -> D RunElem elemToRunElem ns element | isElem ns "w" "t" element diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 4e0bb375a..2a23f2a62 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -740,7 +740,7 @@ pSpace = many1 (satisfy isSpace) >> return B.space -- eitherBlockOrInline :: [String] -eitherBlockOrInline = ["audio", "applet", "button", "iframe", +eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed", "del", "ins", "progress", "map", "area", "noscript", "script", "object", "svg", "video", "source"] @@ -758,7 +758,7 @@ blockHtmlTags :: [String] blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", "blockquote", "body", "button", "canvas", "caption", "center", "col", "colgroup", "dd", "dir", "div", - "dl", "dt", "embed", "fieldset", "figcaption", "figure", + "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "isindex", "menu", "noframes", "ol", "output", "p", "pre", diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 9f51e9a8f..9420d602f 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -494,6 +494,7 @@ inlineCommands = M.fromList $ , ("citealp", citation "citealp" NormalCitation False) , ("citealp*", citation "citealp*" NormalCitation False) , ("autocite", citation "autocite" NormalCitation False) + , ("smartcite", citation "smartcite" NormalCitation False) , ("footcite", inNote <$> citation "footcite" NormalCitation False) , ("parencite", citation "parencite" NormalCitation False) , ("supercite", citation "supercite" NormalCitation False) @@ -516,6 +517,7 @@ inlineCommands = M.fromList $ , ("supercites", citation "supercites" NormalCitation True) , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) , ("Autocite", citation "Autocite" NormalCitation False) + , ("Smartcite", citation "Smartcite" NormalCitation False) , ("Footcite", citation "Footcite" NormalCitation False) , ("Parencite", citation "Parencite" NormalCitation False) , ("Supercite", citation "Supercite" NormalCitation False) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 7a3be8291..b8487b4e6 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -117,6 +117,12 @@ isBlank _ = False -- auxiliary functions -- +-- | Succeeds when we're in list context. +inList :: MarkdownParser () +inList = do + ctx <- stateParserContext <$> getState + guard (ctx == ListItemState) + isNull :: F Inlines -> Bool isNull ils = B.isNull $ runF ils def @@ -926,6 +932,8 @@ para = try $ do <|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced) <|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header) <|> (guardEnabled Ext_lists_without_preceding_blankline >> + -- Avoid creating a paragraph in a nested list. + notFollowedBy' inList >> () <$ lookAhead listStart) <|> do guardEnabled Ext_native_divs inHtmlBlock <- stateInHtmlBlock <$> getState @@ -1610,8 +1618,7 @@ endline = try $ do newline notFollowedBy blankline -- parse potential list-starts differently if in a list: - st <- getState - when (stateParserContext st == ListItemState) $ notFollowedBy listStart + notFollowedBy (inList >> listStart) guardDisabled Ext_lists_without_preceding_blankline <|> notFollowedBy listStart guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 1ddfeab87..579e38a38 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -879,18 +879,18 @@ bulletListStart = bulletListStart' Nothing bulletListStart' :: Maybe Int -> OrgParser Int -- returns length of bulletList prefix, inclusive of marker -bulletListStart' Nothing = do ind <- many spaceChar +bulletListStart' Nothing = do ind <- length <$> many spaceChar + when (ind == 0) $ notFollowedBy (char '*') oneOf bullets many1 spaceChar - return $ length ind + 1 + return (ind + 1) -- Unindented lists are legal, but they can't use '*' bullets -- We return n to maintain compatibility with the generic listItem bulletListStart' (Just n) = do count (n-1) spaceChar - oneOf validBullets + when (n == 1) $ notFollowedBy (char '*') + oneOf bullets many1 spaceChar return n - where validBullets = if n == 1 then noAsterisks else bullets - noAsterisks = filter (/= '*') bullets bullets :: String bullets = "*+-" @@ -1099,7 +1099,7 @@ linkOrImage = explicitOrImageLink explicitOrImageLink :: OrgParser (F Inlines) explicitOrImageLink = try $ do char '[' - srcF <- applyCustomLinkFormat =<< linkTarget + srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget title <- enclosedRaw (char '[') (char ']') title' <- parseFromString (mconcat <$> many inline) title char ']' @@ -1132,6 +1132,9 @@ selfTarget = try $ char '[' *> linkTarget <* char ']' linkTarget :: OrgParser String linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") +possiblyEmptyLinkTarget :: OrgParser String +possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") + applyCustomLinkFormat :: String -> OrgParser (F String) applyCustomLinkFormat link = do let (linkType, rest) = break (== ':') link @@ -1139,27 +1142,33 @@ applyCustomLinkFormat link = do formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters return $ maybe link ($ drop 1 rest) formatter - linkToInlinesF :: String -> Inlines -> F Inlines -linkToInlinesF s@('#':_) = pure . B.link s "" -linkToInlinesF s - | isImageFilename s = const . pure $ B.image s "" "" - | isUri s = pure . B.link s "" - | isRelativeUrl s = pure . B.link s "" -linkToInlinesF s = \title -> do - anchorB <- (s `elem`) <$> asksF orgStateAnchorIds - if anchorB - then pure $ B.link ('#':s) "" title - else pure $ B.emph title - -isRelativeUrl :: String -> Bool -isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s) +linkToInlinesF s = + case s of + "" -> pure . B.link "" "" + ('#':_) -> pure . B.link s "" + _ | isImageFilename s -> const . pure $ B.image s "" "" + _ | isUri s -> pure . B.link s "" + _ | isRelativeFilePath s -> pure . B.link s "" + _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) "" + _ -> \title -> do + anchorB <- (s `elem`) <$> asksF orgStateAnchorIds + if anchorB + then pure $ B.link ('#':s) "" title + else pure $ B.emph title + +isRelativeFilePath :: String -> Bool +isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) && + (':' `notElem` s) isUri :: String -> Bool isUri s = let (scheme, path) = break (== ':') s in all (\c -> isAlphaNum c || c `elem` ".-") scheme && not (null path) +isAbsoluteFilePath :: String -> Bool +isAbsoluteFilePath = ('/' ==) . head + isImageFilename :: String -> Bool isImageFilename filename = any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index e5eccb116..732956981 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -47,7 +47,7 @@ import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import qualified Text.Pandoc.Builder as B import Data.Monoid (mconcat, mempty) import Data.Sequence (viewr, ViewR(..)) -import Data.Char (toLower, isHexDigit) +import Data.Char (toLower, isHexDigit, isSpace) -- | Parse reStructuredText string and return Pandoc document. readRST :: ReaderOptions -- ^ Reader options @@ -335,6 +335,13 @@ indentedBlock = try $ do optional blanklines return $ unlines lns +quotedBlock :: Parser [Char] st [Char] +quotedBlock = try $ do + quote <- lookAhead $ oneOf "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + lns <- many1 $ lookAhead (char quote) >> anyLine + optional blanklines + return $ unlines lns + codeBlockStart :: Parser [Char] st Char codeBlockStart = string "::" >> blankline >> blankline @@ -342,7 +349,8 @@ codeBlock :: Parser [Char] st Blocks codeBlock = try $ codeBlockStart >> codeBlockBody codeBlockBody :: Parser [Char] st Blocks -codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$> indentedBlock +codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$> + (indentedBlock <|> quotedBlock) lhsCodeBlock :: RSTParser Blocks lhsCodeBlock = try $ do @@ -513,7 +521,6 @@ directive = try $ do -- TODO: line-block, parsed-literal, table, csv-table, list-table -- date -- include --- class -- title directive' :: RSTParser Blocks directive' = do @@ -594,6 +601,13 @@ directive' = do Just t -> B.link (escapeURI $ trim t) "" $ B.image src "" alt Nothing -> B.image src "" alt + "class" -> do + let attrs = ("", (splitBy isSpace $ trim top), map (\(k,v) -> (k, trimr v)) fields) + -- directive content or the first immediately following element + children <- case body of + "" -> block + _ -> parseFromString parseBlocks body' + return $ B.divWith attrs children _ -> return mempty -- TODO: diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs new file mode 100644 index 000000000..c2325c0ea --- /dev/null +++ b/src/Text/Pandoc/Readers/TWiki.hs @@ -0,0 +1,526 @@ +{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-} +-- RelaxedPolyRec needed for inlinesBetween on GHC < 7 +{- + Copyright (C) 2014 Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.TWiki + Copyright : Copyright (C) 2014 Alexander Sulfrian + License : GNU GPL, version 2 or above + + Maintainer : Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> + Stability : alpha + Portability : portable + +Conversion of twiki text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.TWiki ( readTWiki + , readTWikiWithWarnings + ) where + +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (enclosed, macro, nested) +import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag) +import Data.Monoid (Monoid, mconcat, mempty) +import Control.Applicative ((<$>), (<*), (*>), (<$)) +import Control.Monad +import Text.Printf (printf) +import Debug.Trace (trace) +import Text.Pandoc.XML (fromEntities) +import Data.Maybe (fromMaybe) +import Text.HTML.TagSoup +import Data.Char (isAlphaNum) +import qualified Data.Foldable as F + +-- | Read twiki from an input string and return a Pandoc document. +readTWiki :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readTWiki opts s = + (readWith parseTWiki) def{ stateOptions = opts } (s ++ "\n\n") + +readTWikiWithWarnings :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> (Pandoc, [String]) +readTWikiWithWarnings opts s = + (readWith parseTWikiWithWarnings) def{ stateOptions = opts } (s ++ "\n\n") + where parseTWikiWithWarnings = do + doc <- parseTWiki + warnings <- stateWarnings <$> getState + return (doc, warnings) + +type TWParser = Parser [Char] ParserState + +-- +-- utility functions +-- + +tryMsg :: String -> TWParser a -> TWParser a +tryMsg msg p = try p <?> msg + +skip :: TWParser a -> TWParser () +skip parser = parser >> return () + +nested :: TWParser a -> TWParser a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +htmlElement :: String -> TWParser (Attr, String) +htmlElement tag = tryMsg tag $ do + (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag []) + content <- manyTill anyChar (endtag <|> endofinput) + return (htmlAttrToPandoc attr, trim content) + where + endtag = skip $ htmlTag (~== TagClose tag) + endofinput = lookAhead $ try $ skipMany blankline >> skipSpaces >> eof + trim = dropWhile (=='\n') . reverse . dropWhile (=='\n') . reverse + +htmlAttrToPandoc :: [Attribute String] -> Attr +htmlAttrToPandoc attrs = (ident, classes, keyvals) + where + ident = fromMaybe "" $ lookup "id" attrs + classes = maybe [] words $ lookup "class" attrs + keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + +parseHtmlContentWithAttrs :: String -> TWParser a -> TWParser (Attr, [a]) +parseHtmlContentWithAttrs tag parser = do + (attr, content) <- htmlElement tag + parsedContent <- try $ parseContent content + return (attr, parsedContent) + where + parseContent = parseFromString $ nested $ manyTill parser endOfContent + endOfContent = try $ skipMany blankline >> skipSpaces >> eof + +parseHtmlContent :: String -> TWParser a -> TWParser [a] +parseHtmlContent tag p = parseHtmlContentWithAttrs tag p >>= return . snd + +-- +-- main parser +-- + +parseTWiki :: TWParser Pandoc +parseTWiki = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + + +-- +-- block parsers +-- + +block :: TWParser B.Blocks +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- mempty <$ skipMany1 blankline + <|> blockElements + <|> para + skipMany blankline + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res + +blockElements :: TWParser B.Blocks +blockElements = choice [ separator + , header + , verbatim + , literal + , list "" + , table + , blockQuote + , noautolink + ] + +separator :: TWParser B.Blocks +separator = tryMsg "separator" $ string "---" >> newline >> return B.horizontalRule + +header :: TWParser B.Blocks +header = tryMsg "header" $ do + string "---" + level <- many1 (char '+') >>= return . length + guard $ level <= 6 + classes <- option [] $ string "!!" >> return ["unnumbered"] + skipSpaces + content <- B.trimInlines . mconcat <$> manyTill inline newline + attr <- registerHeader ("", classes, []) content + return $ B.headerWith attr level $ content + +verbatim :: TWParser B.Blocks +verbatim = (htmlElement "verbatim" <|> htmlElement "pre") + >>= return . (uncurry B.codeBlockWith) + +literal :: TWParser B.Blocks +literal = htmlElement "literal" >>= return . rawBlock + where + format (_, _, kvs) = fromMaybe "html" $ lookup "format" kvs + rawBlock (attrs, content) = B.rawBlock (format attrs) content + +list :: String -> TWParser B.Blocks +list prefix = choice [ bulletList prefix + , orderedList prefix + , definitionList prefix] + +definitionList :: String -> TWParser B.Blocks +definitionList prefix = tryMsg "definitionList" $ do + indent <- lookAhead $ string prefix *> (many1 $ string " ") <* string "$ " + elements <- many $ parseDefinitionListItem (prefix ++ concat indent) + return $ B.definitionList elements + where + parseDefinitionListItem :: String -> TWParser (B.Inlines, [B.Blocks]) + parseDefinitionListItem indent = do + string (indent ++ "$ ") >> skipSpaces + term <- many1Till inline $ string ": " + line <- listItemLine indent $ string "$ " + return $ (mconcat term, [line]) + +bulletList :: String -> TWParser B.Blocks +bulletList prefix = tryMsg "bulletList" $ + parseList prefix (char '*') (char ' ') + +orderedList :: String -> TWParser B.Blocks +orderedList prefix = tryMsg "orderedList" $ + parseList prefix (oneOf "1iIaA") (string ". ") + +parseList :: Show a => String -> TWParser Char -> TWParser a -> TWParser B.Blocks +parseList prefix marker delim = do + (indent, style) <- lookAhead $ string prefix *> listStyle <* delim + blocks <- many $ parseListItem (prefix ++ indent) (char style <* delim) + return $ case style of + '1' -> B.orderedListWith (1, DefaultStyle, DefaultDelim) blocks + 'i' -> B.orderedListWith (1, LowerRoman, DefaultDelim) blocks + 'I' -> B.orderedListWith (1, UpperRoman, DefaultDelim) blocks + 'a' -> B.orderedListWith (1, LowerAlpha, DefaultDelim) blocks + 'A' -> B.orderedListWith (1, UpperAlpha, DefaultDelim) blocks + _ -> B.bulletList blocks + where + listStyle = do + indent <- many1 $ string " " + style <- marker + return (concat indent, style) + +parseListItem :: Show a => String -> TWParser a -> TWParser B.Blocks +parseListItem prefix marker = string prefix >> marker >> listItemLine prefix marker + +listItemLine :: Show a => String -> TWParser a -> TWParser B.Blocks +listItemLine prefix marker = lineContent >>= parseContent >>= return . mconcat + where + lineContent = do + content <- anyLine + continuation <- optionMaybe listContinuation + return $ filterSpaces content ++ "\n" ++ (maybe "" (" " ++) continuation) + filterSpaces = reverse . dropWhile (== ' ') . reverse + listContinuation = notFollowedBy (string prefix >> marker) >> + string " " >> lineContent + parseContent = parseFromString $ many1 $ nestedList <|> parseInline + parseInline = many1Till inline (lastNewline <|> newlineBeforeNestedList) >>= + return . B.plain . mconcat + nestedList = list prefix + lastNewline = try $ char '\n' <* eof + newlineBeforeNestedList = try $ char '\n' <* lookAhead nestedList + +table :: TWParser B.Blocks +table = try $ do + tableHead <- optionMaybe $ many1Till tableParseHeader newline >>= return . unzip + rows <- many1 tableParseRow + return $ buildTable mempty rows $ fromMaybe (align rows, columns rows) tableHead + where + buildTable caption rows (aligns, heads) + = B.table caption aligns heads rows + align rows = replicate (columCount rows) (AlignDefault, 0) + columns rows = replicate (columCount rows) mempty + columCount rows = length $ head rows + +tableParseHeader :: TWParser ((Alignment, Double), B.Blocks) +tableParseHeader = try $ do + char '|' + leftSpaces <- many spaceChar >>= return . length + char '*' + content <- tableColumnContent (char '*' >> skipSpaces >> char '|') + char '*' + rightSpaces <- many spaceChar >>= return . length + optional tableEndOfRow + return (tableAlign leftSpaces rightSpaces, content) + where + tableAlign left right + | left >= 2 && left == right = (AlignCenter, 0) + | left > right = (AlignRight, 0) + | otherwise = (AlignLeft, 0) + +tableParseRow :: TWParser [B.Blocks] +tableParseRow = many1Till tableParseColumn newline + +tableParseColumn :: TWParser B.Blocks +tableParseColumn = char '|' *> skipSpaces *> + tableColumnContent (skipSpaces >> char '|') + <* skipSpaces <* optional tableEndOfRow + +tableEndOfRow :: TWParser Char +tableEndOfRow = lookAhead (try $ char '|' >> char '\n') >> char '|' + +tableColumnContent :: Show a => TWParser a -> TWParser B.Blocks +tableColumnContent end = manyTill content (lookAhead $ try end) >>= return . B.plain . mconcat + where + content = continuation <|> inline + continuation = try $ char '\\' >> newline >> return mempty + +blockQuote :: TWParser B.Blocks +blockQuote = parseHtmlContent "blockquote" block >>= return . B.blockQuote . mconcat + +noautolink :: TWParser B.Blocks +noautolink = do + (_, content) <- htmlElement "noautolink" + st <- getState + setState $ st{ stateAllowLinks = False } + blocks <- try $ parseContent content + setState $ st{ stateAllowLinks = True } + return $ mconcat blocks + where + parseContent = parseFromString $ many $ block + +para :: TWParser B.Blocks +para = many1Till inline endOfParaElement >>= return . result . mconcat + where + endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement + endOfInput = try $ skipMany blankline >> skipSpaces >> eof + endOfPara = try $ blankline >> skipMany1 blankline + newBlockElement = try $ blankline >> skip blockElements + result content = if F.all (==Space) content + then mempty + else B.para $ B.trimInlines content + + +-- +-- inline parsers +-- + +inline :: TWParser B.Inlines +inline = choice [ whitespace + , br + , macro + , strong + , strongHtml + , strongAndEmph + , emph + , emphHtml + , boldCode + , smart + , link + , htmlComment + , code + , codeHtml + , nop + , autoLink + , str + , symbol + ] <?> "inline" + +whitespace :: TWParser B.Inlines +whitespace = (lb <|> regsp) >>= return + where lb = try $ skipMany spaceChar >> linebreak >> return B.space + regsp = try $ skipMany1 spaceChar >> return B.space + +br :: TWParser B.Inlines +br = try $ string "%BR%" >> return B.linebreak + +linebreak :: TWParser B.Inlines +linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) + where lastNewline = eof >> return mempty + innerNewline = return B.space + +between :: (Show b, Monoid c) => TWParser a -> TWParser b -> (TWParser b -> TWParser c) -> TWParser c +between start end p = + mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) + +enclosed :: (Show a, Monoid b) => TWParser a -> (TWParser a -> TWParser b) -> TWParser b +enclosed sep p = between sep (try $ sep <* endMarker) p + where + endMarker = lookAhead $ skip endSpace <|> skip (oneOf ".,!?:)|") <|> eof + endSpace = (spaceChar <|> newline) >> return B.space + +macro :: TWParser B.Inlines +macro = macroWithParameters <|> withoutParameters + where + withoutParameters = enclosed (char '%') (\_ -> macroName) >>= return . emptySpan + emptySpan name = buildSpan name [] mempty + +macroWithParameters :: TWParser B.Inlines +macroWithParameters = try $ do + char '%' + name <- macroName + (content, kvs) <- attributes + char '%' + return $ buildSpan name kvs $ B.str content + +buildSpan :: String -> [(String, String)] -> B.Inlines -> B.Inlines +buildSpan className kvs = B.spanWith attrs + where + attrs = ("", ["twiki-macro", className] ++ additionalClasses, kvsWithoutClasses) + additionalClasses = maybe [] words $ lookup "class" kvs + kvsWithoutClasses = [(k,v) | (k,v) <- kvs, k /= "class"] + +macroName :: TWParser String +macroName = do + first <- letter + rest <- many $ alphaNum <|> char '_' + return (first:rest) + +attributes :: TWParser (String, [(String, String)]) +attributes = char '{' *> spnl *> many (attribute <* spnl) <* char '}' >>= + return . foldr (either mkContent mkKvs) ([], []) + where + spnl = skipMany (spaceChar <|> newline) + mkContent c ([], kvs) = (c, kvs) + mkContent c (rest, kvs) = (c ++ " " ++ rest, kvs) + mkKvs kv (cont, rest) = (cont, (kv : rest)) + +attribute :: TWParser (Either String (String, String)) +attribute = withKey <|> withoutKey + where + withKey = try $ do + key <- macroName + char '=' + parseValue False >>= return . (curry Right key) + withoutKey = try $ parseValue True >>= return . Left + parseValue allowSpaces = (withQuotes <|> withoutQuotes allowSpaces) >>= return . fromEntities + withQuotes = between (char '"') (char '"') (\_ -> count 1 $ noneOf ['"']) + withoutQuotes allowSpaces + | allowSpaces == True = many1 $ noneOf "}" + | otherwise = many1 $ noneOf " }" + +nestedInlines :: Show a => TWParser a -> TWParser B.Inlines +nestedInlines end = innerSpace <|> nestedInline + where + innerSpace = try $ whitespace <* (notFollowedBy end) + nestedInline = notFollowedBy whitespace >> nested inline + +strong :: TWParser B.Inlines +strong = try $ enclosed (char '*') nestedInlines >>= return . B.strong + +strongHtml :: TWParser B.Inlines +strongHtml = (parseHtmlContent "strong" inline <|> parseHtmlContent "b" inline) + >>= return . B.strong . mconcat + +strongAndEmph :: TWParser B.Inlines +strongAndEmph = try $ enclosed (string "__") nestedInlines >>= return . B.emph . B.strong + +emph :: TWParser B.Inlines +emph = try $ enclosed (char '_') nestedInlines >>= return . B.emph + +emphHtml :: TWParser B.Inlines +emphHtml = (parseHtmlContent "em" inline <|> parseHtmlContent "i" inline) + >>= return . B.emph . mconcat + +nestedString :: Show a => TWParser a -> TWParser String +nestedString end = innerSpace <|> (count 1 nonspaceChar) + where + innerSpace = try $ many1 spaceChar <* notFollowedBy end + +boldCode :: TWParser B.Inlines +boldCode = try $ enclosed (string "==") nestedString >>= return . B.strong . B.code . fromEntities + +htmlComment :: TWParser B.Inlines +htmlComment = htmlTag isCommentTag >> return mempty + +code :: TWParser B.Inlines +code = try $ enclosed (char '=') nestedString >>= return . B.code . fromEntities + +codeHtml :: TWParser B.Inlines +codeHtml = do + (attrs, content) <- parseHtmlContentWithAttrs "code" anyChar + return $ B.codeWith attrs $ fromEntities content + +autoLink :: TWParser B.Inlines +autoLink = try $ do + state <- getState + guard $ stateAllowLinks state + (text, url) <- parseLink + guard $ checkLink (head $ reverse url) + return $ makeLink (text, url) + where + parseLink = notFollowedBy nop >> (uri <|> emailAddress) + makeLink (text, url) = B.link url "" $ B.str text + checkLink c + | c == '/' = True + | otherwise = isAlphaNum c + +str :: TWParser B.Inlines +str = (many1 alphaNum <|> count 1 characterReference) >>= return . B.str + +nop :: TWParser B.Inlines +nop = try $ (skip exclamation <|> skip nopTag) >> followContent + where + exclamation = char '!' + nopTag = stringAnyCase "<nop>" + followContent = many1 nonspaceChar >>= return . B.str . fromEntities + +symbol :: TWParser B.Inlines +symbol = count 1 nonspaceChar >>= return . B.str + +smart :: TWParser B.Inlines +smart = do + getOption readerSmart >>= guard + doubleQuoted <|> singleQuoted <|> + choice [ apostrophe + , dash + , ellipses + ] + +singleQuoted :: TWParser B.Inlines +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ + many1Till inline singleQuoteEnd >>= + (return . B.singleQuoted . B.trimInlines . mconcat) + +doubleQuoted :: TWParser B.Inlines +doubleQuoted = try $ do + doubleQuoteStart + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> + return (B.doubleQuoted $ B.trimInlines contents)) + <|> (return $ (B.str "\8220") B.<> contents) + +link :: TWParser B.Inlines +link = try $ do + st <- getState + guard $ stateAllowLinks st + setState $ st{ stateAllowLinks = False } + (url, title, content) <- linkText + setState $ st{ stateAllowLinks = True } + return $ B.link url title content + +linkText :: TWParser (String, String, B.Inlines) +linkText = do + string "[[" + url <- many1Till anyChar (char ']') + content <- option [B.str url] linkContent + char ']' + return (url, "", mconcat content) + where + linkContent = (char '[') >> many1Till anyChar (char ']') >>= parseLinkContent + parseLinkContent = parseFromString $ many1 inline |