diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 25 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 42 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 24 |
4 files changed, 60 insertions, 35 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 364483929..7265ef8dd 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -661,14 +661,14 @@ elemToParPart ns element | isElem ns "w" "r" element = elemToRun ns element >>= (\r -> return $ PlainRun r) elemToParPart ns element - | isElem ns "w" "ins" element + | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element , Just cId <- findAttr (elemName ns "w" "id") element , Just cAuthor <- findAttr (elemName ns "w" "author") element , Just cDate <- findAttr (elemName ns "w" "date") element = do runs <- mapD (elemToRun ns) (elChildren element) return $ Insertion cId cAuthor cDate runs elemToParPart ns element - | isElem ns "w" "del" element + | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element , Just cId <- findAttr (elemName ns "w" "id") element , Just cAuthor <- findAttr (elemName ns "w" "author") element , Just cDate <- findAttr (elemName ns "w" "date") element = do diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index fb936cff7..8ee5da543 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -971,11 +971,20 @@ htmlTag :: Monad m htmlTag f = try $ do lookAhead (char '<') inp <- getInput - let (next : rest) = canonicalizeTags $ parseTagsOptions - parseOptions{ optTagWarning = True } inp + let (next : _) = canonicalizeTags $ parseTagsOptions + parseOptions{ optTagWarning = False } inp guard $ f next + let handleTag tagname = do + -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> + -- should NOT be parsed as an HTML tag, see #2277 + guard $ not ('.' `elem` tagname) + -- <https://example.org> should NOT be a tag either. + -- tagsoup will parse it as TagOpen "https:" [("example.org","")] + guard $ not (null tagname) + guard $ last tagname /= ':' + rendered <- manyTill anyChar (char '>') + return (next, rendered ++ ">") case next of - TagWarning _ -> fail "encountered TagWarning" TagComment s | "<!--" `isPrefixOf` inp -> do count (length s + 4) anyChar @@ -983,13 +992,9 @@ htmlTag f = try $ do char '>' return (next, "<!--" ++ s ++ "-->") | otherwise -> fail "bogus comment mode, HTML5 parse error" - _ -> do - -- we get a TagWarning on things like - -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> - -- which should NOT be parsed as an HTML tag, see #2277 - guard $ not $ hasTagWarning rest - rendered <- manyTill anyChar (char '>') - return (next, rendered ++ ">") + TagOpen tagname _attr -> handleTag tagname + TagClose tagname -> handleTag tagname + _ -> mzero mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index b5d175453..e43714526 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -122,9 +122,6 @@ inList = do ctx <- stateParserContext <$> getState guard (ctx == ListItemState) -isNull :: F Inlines -> Bool -isNull ils = B.isNull $ runF ils def - spnl :: Parser [Char] st () spnl = try $ do skipSpaces @@ -188,31 +185,38 @@ charsInBalancedBrackets openBrackets = -- document structure -- -titleLine :: MarkdownParser (F Inlines) -titleLine = try $ do +rawTitleBlockLine :: MarkdownParser String +rawTitleBlockLine = do char '%' skipSpaces - res <- many $ (notFollowedBy newline >> inline) - <|> try (endline >> whitespace) - newline + first <- anyLine + rest <- many $ try $ do spaceChar + notFollowedBy blankline + skipSpaces + anyLine + return $ trim $ unlines (first:rest) + +titleLine :: MarkdownParser (F Inlines) +titleLine = try $ do + raw <- rawTitleBlockLine + res <- parseFromString (many inline) raw return $ trimInlinesF $ mconcat res authorsLine :: MarkdownParser (F [Inlines]) authorsLine = try $ do - char '%' - skipSpaces - authors <- sepEndBy (many (notFollowedBy (satisfy $ \c -> - c == ';' || c == '\n') >> inline)) - (char ';' <|> - try (newline >> notFollowedBy blankline >> spaceChar)) - newline - return $ sequence $ filter (not . isNull) $ map (trimInlinesF . mconcat) authors + raw <- rawTitleBlockLine + let sep = (char ';' <* spaces) <|> newline + let pAuthors = sepEndBy + (trimInlinesF . mconcat <$> many + (try $ notFollowedBy sep >> inline)) + sep + sequence <$> parseFromString pAuthors raw dateLine :: MarkdownParser (F Inlines) dateLine = try $ do - char '%' - skipSpaces - trimInlinesF . mconcat <$> manyTill inline newline + raw <- rawTitleBlockLine + res <- parseFromString (many inline) raw + return $ trimInlinesF $ mconcat res titleBlock :: MarkdownParser () titleBlock = pandocTitleBlock <|> mmdTitleBlock diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 7dd611be3..5e98be31d 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -391,6 +391,9 @@ lookupBlockAttribute key = type BlockProperties = (Int, String) -- (Indentation, Block-Type) +updateIndent :: BlockProperties -> Int -> BlockProperties +updateIndent (_, blkType) indent = (indent, blkType) + orgBlock :: OrgParser (F Blocks) orgBlock = try $ do blockProp@(_, blkType) <- blockHeaderStart @@ -407,11 +410,23 @@ orgBlock = try $ do _ -> withParsed (fmap $ divWithClass blkType) blockHeaderStart :: OrgParser (Int, String) -blockHeaderStart = try $ (,) <$> indent <*> blockType +blockHeaderStart = try $ (,) <$> indentation <*> blockType where - indent = length <$> many spaceChar blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord) +indentation :: OrgParser Int +indentation = try $ do + tabStop <- getOption readerTabStop + s <- many spaceChar + return $ spaceLength tabStop s + +spaceLength :: Int -> String -> Int +spaceLength tabStop s = (sum . map charLen) s + where + charLen ' ' = 1 + charLen '\t' = tabStop + charLen _ = 0 + withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp)) @@ -450,7 +465,8 @@ codeBlock blkProp = do skipSpaces (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) id' <- fromMaybe "" <$> lookupBlockAttribute "name" - content <- rawBlockContent blkProp + leadingIndent <- lookAhead indentation + content <- rawBlockContent (updateIndent blkProp leadingIndent) resultsContent <- followingResultsBlock let includeCode = exportsCode kv let includeResults = exportsResults kv @@ -472,7 +488,7 @@ rawBlockContent (indent, blockType) = try $ unlines . map commaEscaped <$> manyTill indentedLine blockEnder where indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine) - blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType) + blockEnder = try $ skipSpaces *> stringAnyCase ("#+end_" <> blockType) parsedBlockContent :: BlockProperties -> OrgParser (F Blocks) parsedBlockContent blkProps = try $ do |