From 24b2ac43b0a8596f7baea10579c95ee75b6e584f Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Tue, 4 Mar 2014 00:33:25 +0100 Subject: Add a simple Emacs Org-mode reader The basic structure of org-mode documents is recognized; however, org-mode features like todo markers, tags etc. are not supported yet. --- src/Text/Pandoc.hs | 3 + src/Text/Pandoc/Readers/Org.hs | 552 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 555 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Org.hs (limited to 'src') diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 3ae81db00..e511ed861 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -65,6 +65,7 @@ module Text.Pandoc , readMarkdown , readMediaWiki , readRST + , readOrg , readLaTeX , readHtml , readTextile @@ -115,6 +116,7 @@ import Text.Pandoc.JSON import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.MediaWiki import Text.Pandoc.Readers.RST +import Text.Pandoc.Readers.Org import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.OPML import Text.Pandoc.Readers.LaTeX @@ -201,6 +203,7 @@ readers = [ ("native" , \_ s -> return $ readNative s) ,("mediawiki" , \o s -> return $ readMediaWiki o s) ,("docbook" , \o s -> return $ readDocBook o s) ,("opml" , \o s -> return $ readOPML o s) + ,("org" , \o s -> return $ readOrg o s) ,("textile" , \o s -> return $ readTextile o s) -- TODO : textile+lhs ,("html" , \o s -> return $ readHtml o s) ,("latex" , \o s -> return $ readLaTeX o s) diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs new file mode 100644 index 000000000..5dc250f04 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org.hs @@ -0,0 +1,552 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2014 Albert Krewinkel + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org + Copyright : Copyright (C) 2014 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + +Conversion of Org-Mode to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Org ( readOrg ) where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (orderedListMarker) +import Text.Pandoc.Shared (compactify') + +import Control.Applicative (pure, (<$>), (<$), (<*>), (<*), (*>), (<**>)) +import Control.Monad (guard, mzero) +import Data.Char (toLower) +import Data.List (foldl') +import Data.Maybe (listToMaybe, fromMaybe) +import Data.Monoid (mconcat, mempty, mappend) + +-- | Parse org-mode string and return a Pandoc document. +readOrg :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readOrg opts s = (readWith parseOrg) def{ stateOptions = opts } (s ++ "\n\n") + +type OrgParser = Parser [Char] ParserState + +parseOrg:: OrgParser Pandoc +parseOrg = do + blocks' <- B.toList <$> parseBlocks + st <- getState + let meta = stateMeta st + return $ Pandoc meta $ filter (/= Null) blocks' + +-- +-- parsing blocks +-- + +parseBlocks :: OrgParser Blocks +parseBlocks = mconcat <$> manyTill block eof + +block :: OrgParser Blocks +block = choice [ mempty <$ blanklines + , orgBlock + , example + , drawer + , specialLine + , header + , hline + , list + , table + , paraOrPlain + ] "block" + +-- +-- Org Blocks (#+BEGIN_... / #+END_...) +-- + +orgBlock :: OrgParser Blocks +orgBlock = try $ do + (indent, blockType, args) <- blockHeader + blockStr <- rawBlockContent indent blockType + let classArgs = [ translateLang . fromMaybe [] $ listToMaybe args ] + case blockType of + "comment" -> return mempty + "src" -> return $ B.codeBlockWith ("", classArgs, []) blockStr + _ -> B.divWith ("", [blockType], []) + <$> (parseFromString parseBlocks blockStr) + +blockHeader :: OrgParser (Int, String, [String]) +blockHeader = (,,) <$> blockIndent + <*> blockType + <*> (skipSpaces *> blockArgs) + where blockIndent = length <$> many spaceChar + blockType = map toLower <$> (stringAnyCase "#+begin_" *> many letter) + blockArgs = manyTill (many nonspaceChar <* skipSpaces) newline + +rawBlockContent :: Int -> String -> OrgParser String +rawBlockContent indent blockType = + unlines . map commaEscaped <$> manyTill indentedLine blockEnder + where + indentedLine = try $ choice [ blankline *> pure "\n" + , indentWith indent *> anyLine + ] + blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType) + +-- indent by specified number of spaces (or equiv. tabs) +indentWith :: Int -> OrgParser String +indentWith num = do + tabStop <- getOption readerTabStop + if (num < tabStop) + then count num (char ' ') + else choice [ try (count num (char ' ')) + , try (char '\t' >> count (num - tabStop) (char ' ')) ] + +translateLang :: String -> String +translateLang "sh" = "bash" +translateLang cs = cs + +commaEscaped :: String -> String +commaEscaped (',':cs@('*':_)) = cs +commaEscaped (',':cs@('#':'+':_)) = cs +commaEscaped cs = cs + +example :: OrgParser Blocks +example = try $ + B.codeBlockWith ("", ["example"], []) . unlines <$> many1 exampleLine + +exampleLine :: OrgParser String +exampleLine = try $ string ": " *> anyLine + +-- Drawers for properties or a logbook +drawer :: OrgParser Blocks +drawer = try $ do + drawerStart + manyTill drawerLine (try drawerEnd) + return mempty + +drawerStart :: OrgParser String +drawerStart = try $ + skipSpaces *> drawerName <* skipSpaces <* newline + where drawerName = try $ char ':' *> validDrawerName <* char ':' + validDrawerName = stringAnyCase "PROPERTIES" + <|> stringAnyCase "LOGBOOK" + +drawerLine :: OrgParser String +drawerLine = try $ anyLine + +drawerEnd :: OrgParser String +drawerEnd = try $ + skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* newline + + +-- Comments, Options and Metadata +specialLine :: OrgParser Blocks +specialLine = try $ metaLine <|> commentLine + +metaLine :: OrgParser Blocks +metaLine = try $ metaLineStart *> declarationLine + +commentLine :: OrgParser Blocks +commentLine = try $ commentLineStart *> anyLine *> pure mempty + +-- The order, in which blocks are tried, makes sure that we're not looking at +-- the beginning of a block, so we don't need to check for it +metaLineStart :: OrgParser String +metaLineStart = try $ mappend <$> many spaceChar <*> string "#+" + +commentLineStart :: OrgParser String +commentLineStart = try $ mappend <$> many spaceChar <*> string "# " + +declarationLine :: OrgParser Blocks +declarationLine = try $ do + meta' <- B.setMeta <$> metaKey <*> metaValue <*> pure nullMeta + updateState $ \st -> st { stateMeta = stateMeta st <> meta' } + return mempty + +metaValue :: OrgParser MetaValue +metaValue = MetaInlines . B.toList . trimInlines <$> restOfLine + +metaKey :: OrgParser [Char] +metaKey = map toLower <$> many1 (noneOf ": \n\r") + <* char ':' + <* skipSpaces + +-- | Headers +header :: OrgParser Blocks +header = try $ + B.header <$> headerStart + <*> (trimInlines <$> restOfLine) + +headerStart :: OrgParser Int +headerStart = try $ + (length <$> many1 (char '*')) <* many1 (char ' ') + +-- Horizontal Line (five dashes or more) +hline :: OrgParser Blocks +hline = try $ do + skipSpaces + string "-----" + many (char '-') + skipSpaces + newline + return B.horizontalRule + +-- +-- Tables +-- + +data OrgTableRow = OrgContentRow [Blocks] + | OrgAlignRow [Alignment] + | OrgHlineRow + deriving (Eq, Show) + +type OrgTableContent = (Int, [Alignment], [Double], [Blocks], [[Blocks]]) + +table :: OrgParser Blocks +table = try $ do + lookAhead tableStart + (_, aligns, widths, heads, lns) <- normalizeTable . tableContent <$> tableRows + return $ B.table "" (zip aligns widths) heads lns + +tableStart :: OrgParser Char +tableStart = try $ skipSpaces *> char '|' + +tableRows :: OrgParser [OrgTableRow] +tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) + +tableContentRow :: OrgParser OrgTableRow +tableContentRow = try $ + OrgContentRow <$> (tableStart *> manyTill tableContentCell newline) + +tableContentCell :: OrgParser Blocks +tableContentCell = try $ + B.plain . trimInlines . mconcat <$> many1Till inline (try endOfCell) + +endOfCell :: OrgParser Char +-- endOfCell = char '|' <|> newline +endOfCell = try $ char '|' <|> lookAhead newline + +tableAlignRow :: OrgParser OrgTableRow +tableAlignRow = try $ + OrgAlignRow <$> (tableStart *> manyTill tableAlignCell newline) + +tableAlignCell :: OrgParser Alignment +tableAlignCell = + choice [ try $ emptyCell *> return (AlignDefault) + , try $ skipSpaces + *> char '<' + *> tableAlignFromChar + <* many digit + <* char '>' + <* emptyCell + ] "alignment info" + where emptyCell = try $ skipSpaces *> endOfCell + +tableAlignFromChar :: OrgParser Alignment +tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft + , char 'c' *> return AlignCenter + , char 'r' *> return AlignRight + ] + +tableHline :: OrgParser OrgTableRow +tableHline = try $ + OrgHlineRow <$ (tableStart *> char '-' *> anyLine) + +tableContent :: [OrgTableRow] + -> OrgTableContent +tableContent = foldl' (flip rowToContent) (0, mempty, repeat 0, mempty, mempty) + +normalizeTable :: OrgTableContent + -> OrgTableContent +normalizeTable (cols, aligns, widths, heads, lns) = + let aligns' = fillColumns aligns AlignDefault + widths' = fillColumns widths 0.0 + heads' = if heads == mempty + then heads + else fillColumns heads (B.plain mempty) + lns' = map (flip fillColumns (B.plain mempty)) lns + fillColumns base padding = take cols $ base ++ repeat padding + in (cols, aligns', widths', heads', lns') + + +-- One or more horizontal rules after the first content line mark the previous +-- line as a header. All other horizontal lines are discarded. +rowToContent :: OrgTableRow + -> OrgTableContent + -> OrgTableContent +rowToContent OrgHlineRow = maybeBodyToHeader +rowToContent (OrgContentRow rs) = setLongestRow rs . appendToBody rs +rowToContent (OrgAlignRow as) = setLongestRow as . setAligns as + +setLongestRow :: [a] + -> OrgTableContent + -> OrgTableContent +setLongestRow r (cols, aligns, widths, heads, lns) = + (max cols (length r), aligns, widths, heads, lns) + +maybeBodyToHeader :: OrgTableContent + -> OrgTableContent +maybeBodyToHeader (cols, aligns, widths, [], b:[]) = (cols, aligns, widths, b, []) +maybeBodyToHeader content = content + +appendToBody :: [Blocks] + -> OrgTableContent + -> OrgTableContent +appendToBody r (cols, aligns, widths, heads, lns) = + (cols, aligns, widths, heads, lns ++ [r]) + +setAligns :: [Alignment] + -> OrgTableContent + -> OrgTableContent +setAligns aligns (cols, _, widths, heads, lns) = + (cols, aligns, widths, heads, lns) + +-- Paragraphs or Plain text +paraOrPlain :: OrgParser Blocks +paraOrPlain = try $ + trimInlines . mconcat + <$> many1 inline + <**> option B.plain + (try $ newline *> pure B.para) + +restOfLine :: OrgParser Inlines +restOfLine = mconcat <$> manyTill inline newline + + +-- +-- list blocks +-- + +list :: OrgParser Blocks +list = choice [ bulletList, orderedList ] "list" + +bulletList :: OrgParser Blocks +bulletList = B.bulletList . compactify' <$> many1 (listItem bulletListStart) + +orderedList :: OrgParser Blocks +orderedList = B.orderedList . compactify' <$> many1 (listItem orderedListStart) + +genericListStart :: OrgParser String + -> OrgParser Int +genericListStart listMarker = try $ + (+) <$> (length <$> many spaceChar) + <*> (length <$> listMarker <* many1 spaceChar) + +-- parses bullet list start and returns its length (excl. following whitespace) +bulletListStart :: OrgParser Int +bulletListStart = genericListStart bulletListMarker + where bulletListMarker = pure <$> oneOf "*-+" + +orderedListStart :: OrgParser Int +orderedListStart = genericListStart orderedListMarker + -- Ordered list markers allowed in org-mode + where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") + +listItem :: OrgParser Int + -> OrgParser Blocks +listItem start = try $ do + (markerLength, first) <- try (start >>= rawListItem) + rest <- many (listContinuation markerLength) + parseFromString parseBlocks $ concat (first:rest) + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem :: Int + -> OrgParser (Int, String) +rawListItem markerLength = try $ do + firstLine <- anyLine + restLines <- many (listLine markerLength) + return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> OrgParser String +listContinuation markerLength = try $ + mappend <$> many blankline + <*> (concat <$> many1 (listLine markerLength)) + +-- parse a line of a list item +listLine :: Int + -> OrgParser String +listLine markerLength = try $ + indentWith markerLength *> anyLine + <**> pure (++ "\n") + + +-- +-- inline +-- + +inline :: OrgParser Inlines +inline = choice inlineParsers "inline" + where inlineParsers = [ whitespace + , link + , str + , endline + , emph + , strong + , strikeout + , underline + , code + , verbatim + , subscript + , superscript + , symbol + ] + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\"$'()*+-./:<=>[\\]^_{|}~" + + +whitespace :: OrgParser Inlines +whitespace = B.space <$ skipMany1 spaceChar "whitespace" + +str :: OrgParser Inlines +str = B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + <* updateLastStrPos + +-- an endline character that can be treated as a space, not a structural break +endline :: OrgParser Inlines +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy' exampleLine + notFollowedBy' hline + notFollowedBy' tableStart + notFollowedBy' drawerStart + notFollowedBy' headerStart + notFollowedBy' metaLineStart + notFollowedBy' commentLineStart + notFollowedBy' bulletListStart + notFollowedBy' orderedListStart + return B.space + +link :: OrgParser Inlines +link = explicitLink <|> selfLink "link" + +explicitLink :: OrgParser Inlines +explicitLink = try $ do + char '[' + src <- enclosedRaw (char '[') (char ']') + title <- enclosedInlines (char '[') (char ']') + char ']' + return $ B.link src "" title + +selfLink :: OrgParser Inlines +selfLink = try $ do + src <- enclosedRaw (string "[[") (string "]]") + return $ B.link src "" (B.str src) + +emph :: OrgParser Inlines +emph = B.emph <$> inlinesEnclosedBy '/' + +strong :: OrgParser Inlines +strong = B.strong <$> inlinesEnclosedBy '*' + +strikeout :: OrgParser Inlines +strikeout = B.strikeout <$> inlinesEnclosedBy '+' + +-- There is no underline, so we use strong instead. +underline :: OrgParser Inlines +underline = B.strong <$> inlinesEnclosedBy '_' + +code :: OrgParser Inlines +code = B.code <$> rawEnclosedBy '=' + +verbatim :: OrgParser Inlines +verbatim = B.rawInline "" <$> rawEnclosedBy '~' + +subscript :: OrgParser Inlines +subscript = B.subscript <$> (try $ char '_' *> maybeGroupedByBraces) + +superscript :: OrgParser Inlines +superscript = B.superscript <$> (try $ char '^' *> maybeGroupedByBraces) + +maybeGroupedByBraces :: OrgParser Inlines +maybeGroupedByBraces = try $ + choice [ try $ enclosedInlines (char '{') (char '}') + , B.str . (:"") <$> anyChar + ] + +symbol :: OrgParser Inlines +symbol = B.str . (: "") <$> oneOf specialChars + +enclosedInlines :: OrgParser a + -> OrgParser b + -> OrgParser Inlines +enclosedInlines start end = try $ + trimInlines . mconcat <$> enclosed start end inline + +-- FIXME: This is a hack +inlinesEnclosedBy :: Char + -> OrgParser Inlines +inlinesEnclosedBy c = enclosedInlines (atStart (char c) <* endsOnThisOrNextLine c) + (atEnd $ char c) + +enclosedRaw :: OrgParser a + -> OrgParser b + -> OrgParser String +enclosedRaw start end = try $ + start *> (onSingleLine <|> spanningTwoLines) + where onSingleLine = try $ many1Till (noneOf "\n\r") end + spanningTwoLines = try $ + anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine + +rawEnclosedBy :: Char + -> OrgParser String +rawEnclosedBy c = enclosedRaw (atStart $ char c) (atEnd $ char c) + +-- succeeds only if we're not right after a str (ie. in middle of word) +atStart :: OrgParser a -> OrgParser a +atStart p = do + pos <- getPosition + st <- getState + guard $ stateLastStrPos st /= Just pos + p + +-- | succeeds only if we're at the end of a word +atEnd :: OrgParser a -> OrgParser a +atEnd p = try $ p <* lookingAtEndOfWord + where lookingAtEndOfWord = lookAhead . oneOf $ postWordChars + +postWordChars :: [Char] +postWordChars = "\t\n\r !\"'),-.:?}" + +-- FIXME: These functions are hacks and should be replaced +endsOnThisOrNextLine :: Char + -> OrgParser () +endsOnThisOrNextLine c = do + inp <- getInput + let doOtherwise = \rest -> endsOnThisLine rest c (const mzero) + endsOnThisLine inp c doOtherwise + +endsOnThisLine :: [Char] + -> Char + -> ([Char] -> OrgParser ()) + -> OrgParser () +endsOnThisLine input c doOnOtherLines = do + case break (`elem` c:"\n") input of + (_,'\n':rest) -> doOnOtherLines rest + (_,_:rest@(n:_)) -> if n `elem` postWordChars + then return () + else endsOnThisLine rest c doOnOtherLines + _ -> mzero + -- cgit v1.2.3