From 7760504bb26f215e7d0c57da843f1f1dcc8c1186 Mon Sep 17 00:00:00 2001
From: Albert Krewinkel <tarleb@moltkeplatz.de>
Date: Thu, 8 May 2014 17:01:58 +0200
Subject: Org reader: refactor #+BEGIN..#+END block parsing code

---
 src/Text/Pandoc/Readers/Org.hs | 122 +++++++++++++++++++++++++++--------------
 1 file changed, 80 insertions(+), 42 deletions(-)

(limited to 'src/Text')

diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index dba61dfe0..9df8ce0b3 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -50,7 +50,7 @@ import           Data.Char (isAlphaNum, toLower)
 import           Data.Default
 import           Data.List (intersperse, isPrefixOf, isSuffixOf)
 import qualified Data.Map as M
-import           Data.Maybe (listToMaybe, fromMaybe, isJust)
+import           Data.Maybe (fromMaybe, isJust)
 import           Data.Monoid (Monoid, mconcat, mempty, mappend)
 import           Network.HTTP (urlEncode)
 
@@ -162,7 +162,8 @@ popInlineCharStack = updateState $ \s ->
   s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s }
 
 surroundingEmphasisChar :: OrgParser [Char]
-surroundingEmphasisChar = take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
+surroundingEmphasisChar =
+  take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
 
 startEmphasisNewlinesCounting :: Int -> OrgParser ()
 startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
@@ -170,7 +171,7 @@ startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
 
 decEmphasisNewlinesCount :: OrgParser ()
 decEmphasisNewlinesCount = updateState $ \s ->
-      s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
+  s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
 
 newlinesCountWithinLimits :: OrgParser Bool
 newlinesCountWithinLimits = do
@@ -296,41 +297,60 @@ lookupInlinesAttr attr = try $ do
 -- Org Blocks (#+BEGIN_... / #+END_...)
 --
 
+type BlockProperties = (Int, String)  -- (Indentation, Block-Type)
+
 orgBlock :: OrgParser (F Blocks)
 orgBlock = try $ do
-  (indent, blockType, args) <- blockHeader
-  content <- rawBlockContent indent blockType
-  contentBlocks <- parseFromString parseBlocks (content ++ "\n")
-  let classArgs = [ translateLang . fromMaybe [] $ listToMaybe args ]
-  case blockType of
-    "comment" -> return mempty
-    "html"    -> returnF $ B.rawBlock "html" content
-    "latex"   -> returnF $ B.rawBlock "latex" content
-    "ascii"   -> returnF $ B.rawBlock "ascii" content
-    "example" -> returnF $ exampleCode content
-    "quote"   -> return  $ B.blockQuote <$> contentBlocks
-    "verse"   -> parseVerse content
-    "src"     -> codeBlockWithAttr classArgs content
-    _         -> return  $ B.divWith ("", [blockType], []) <$> contentBlocks
+  blockProp@(_, blkType) <- blockHeaderStart
+  ($ blockProp) $
+    case blkType of
+      "comment" -> withRaw'   (const mempty)
+      "html"    -> withRaw'   (return . (B.rawBlock blkType))
+      "latex"   -> withRaw'   (return . (B.rawBlock blkType))
+      "ascii"   -> withRaw'   (return . (B.rawBlock blkType))
+      "example" -> withRaw'   (return . exampleCode)
+      "quote"   -> withParsed (fmap B.blockQuote)
+      "verse"   -> verseBlock
+      "src"     -> codeBlock
+      _         -> withParsed (fmap $ divWithClass blkType)
+
+blockHeaderStart :: OrgParser (Int, String)
+blockHeaderStart = try $ (,) <$> indent <*> blockType
  where
-   parseVerse :: String -> OrgParser (F Blocks)
-   parseVerse cs =
-       fmap B.para . mconcat . intersperse (pure B.linebreak)
-       <$> mapM (parseFromString parseInlines) (lines cs)
-
-blockHeader :: OrgParser (Int, String, [String])
-blockHeader = (,,) <$> blockIndent
-                   <*> blockType
-                   <*> (skipSpaces *> blockArgs)
- where blockIndent = length <$> many spaceChar
-       blockType = map toLower <$> (stringAnyCase "#+begin_" *> many letter)
-       blockArgs = manyTill (many nonspaceChar <* skipSpaces) newline
-
-codeBlockWithAttr :: [String] -> String -> OrgParser (F Blocks)
-codeBlockWithAttr classArgs content = do
-  identifier <- fromMaybe "" <$> lookupBlockAttribute "name"
-  caption <- lookupInlinesAttr "caption"
-  let codeBlck = B.codeBlockWith (identifier, classArgs, []) content
+  indent    = length      <$> many spaceChar
+  blockType = map toLower <$> (stringAnyCase "#+begin_" *> many orgArgWordChar)
+
+withRaw'   :: (String   -> F Blocks) -> BlockProperties -> OrgParser (F Blocks)
+withRaw'   f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp))
+
+withParsed :: (F Blocks -> F Blocks) -> BlockProperties -> OrgParser (F Blocks)
+withParsed f blockProp = (ignHeaders *> (f <$> parsedBlockContent blockProp))
+
+ignHeaders :: OrgParser ()
+ignHeaders = (() <$ newline) <|> (() <$ anyLine)
+
+divWithClass :: String -> Blocks -> Blocks
+divWithClass cls = B.divWith ("", [cls], [])
+
+verseBlock :: BlockProperties -> OrgParser (F Blocks)
+verseBlock blkProp = try $ do
+  ignHeaders
+  content <- rawBlockContent blkProp
+  fmap B.para . mconcat . intersperse (pure B.linebreak)
+    <$> mapM (parseFromString parseInlines) (lines content)
+
+codeBlock :: BlockProperties -> OrgParser (F Blocks)
+codeBlock blkProp = do
+  skipSpaces
+  language      <- optionMaybe orgArgWord
+  (classes, kv) <- codeHeaderArgs
+  id'           <- fromMaybe "" <$> lookupBlockAttribute "name"
+  caption       <- lookupInlinesAttr "caption"
+  content       <- rawBlockContent blkProp
+  let attr      = ( id'
+                  , maybe id (\l -> (l:)) language $ classes
+                  , kv )
+  let codeBlck  = B.codeBlockWith attr content
   return $ maybe (pure codeBlck) (labelDiv codeBlck) caption
  where
    labelDiv blk value =
@@ -338,14 +358,21 @@ codeBlockWithAttr classArgs content = do
                                        <*> pure blk)
    labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], []))
 
-rawBlockContent :: Int -> String -> OrgParser String
-rawBlockContent indent blockType =
+rawBlockContent :: BlockProperties -> OrgParser String
+rawBlockContent (indent, blockType) = try $
   unlines . map commaEscaped <$> manyTill indentedLine blockEnder
  where
-   indentedLine = try $ choice [ blankline         *> pure "\n"
-                               , indentWith indent *> anyLine
-                               ]
-   blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType)
+   indentedLine = try $
+     choice [ blankline         *> pure "\n"
+            , indentWith indent *> anyLine
+            ]
+   blockEnder = try $
+     indentWith indent *> stringAnyCase ("#+end_" <> blockType)
+
+parsedBlockContent :: BlockProperties -> OrgParser (F Blocks)
+parsedBlockContent blkProps = try $ do
+  raw <- rawBlockContent blkProps
+  parseFromString parseBlocks (raw ++ "\n")
 
 -- indent by specified number of spaces (or equiv. tabs)
 indentWith :: Int -> OrgParser String
@@ -356,6 +383,13 @@ indentWith num = do
      else choice [ try (count num (char ' '))
                  , try (char '\t' >> count (num - tabStop) (char ' ')) ]
 
+orgArgWord :: OrgParser String
+orgArgWord = many1 orgArgWordChar
+
+codeHeaderArgs :: OrgParser ([String], [(String, String)])
+codeHeaderArgs =
+  (\x -> (x, [])) <$> manyTill (many nonspaceChar <* skipSpaces) newline
+
 translateLang :: String -> String
 translateLang "C"          = "c"
 translateLang "C++"        = "cpp"
@@ -1002,9 +1036,13 @@ inlineCodeBlock = try $ do
   returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode
  where enclosedByPair s e p = char s *> many1Till p (char e)
 
+-- | Prefix used for Rundoc classes and arguments.
+rundocPrefix :: String
+rundocPrefix = "rundoc-"
+
 -- | The class-name used to mark rundoc blocks.
 rundocBlockClass :: String
-rundocBlockClass = "rundoc-block"
+rundocBlockClass = rundocPrefix ++ "block"
 
 blockOption :: OrgParser (String, String)
 blockOption = try $ (,) <$> orgArgKey <*> orgArgValue
-- 
cgit v1.2.3


From 757c4f68f3f3cab99db9499936e3ae4775ebbddf Mon Sep 17 00:00:00 2001
From: Albert Krewinkel <tarleb@moltkeplatz.de>
Date: Fri, 9 May 2014 18:07:37 +0200
Subject: Org reader: Support arguments for code blocks

The general form of source block headers
(`#+BEGIN_SRC <language> <switches> <header arguments>`) was not
recognized by the reader.  This patch adds support for the above form,
adds header arguments to the block's key-value pairs and marks the block
as a rundoc block if header arguments are present.

This closes #1286.
---
 src/Text/Pandoc/Readers/Org.hs | 98 ++++++++++++++++++++++++------------------
 tests/Tests/Readers/Org.hs     | 14 ++++++
 2 files changed, 70 insertions(+), 42 deletions(-)

(limited to 'src/Text')

diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index 9df8ce0b3..c05ac92d0 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -276,7 +276,7 @@ parseBlockAttributes = do
  where
    attribute :: OrgParser (String, String)
    attribute = try $ do
-         key <- metaLineStart *> many1Till (noneOf "\n\r") (char ':')
+         key <- metaLineStart *> many1Till nonspaceChar (char ':')
          val <- skipSpaces *> anyLine
          return (map toLower key, val)
 
@@ -342,16 +342,11 @@ verseBlock blkProp = try $ do
 codeBlock :: BlockProperties -> OrgParser (F Blocks)
 codeBlock blkProp = do
   skipSpaces
-  language      <- optionMaybe orgArgWord
-  (classes, kv) <- codeHeaderArgs
+  (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders)
   id'           <- fromMaybe "" <$> lookupBlockAttribute "name"
-  caption       <- lookupInlinesAttr "caption"
   content       <- rawBlockContent blkProp
-  let attr      = ( id'
-                  , maybe id (\l -> (l:)) language $ classes
-                  , kv )
-  let codeBlck  = B.codeBlockWith attr content
-  return $ maybe (pure codeBlck) (labelDiv codeBlck) caption
+  let codeBlck  = B.codeBlockWith ( id', classes, kv ) content
+  maybe (pure codeBlck) (labelDiv codeBlck) <$> lookupInlinesAttr "caption"
  where
    labelDiv blk value =
        B.divWith nullAttr <$> (mappend <$> labelledBlock value
@@ -383,12 +378,33 @@ indentWith num = do
      else choice [ try (count num (char ' '))
                  , try (char '\t' >> count (num - tabStop) (char ' ')) ]
 
+type SwitchOption = (Char, Maybe String)
+
 orgArgWord :: OrgParser String
 orgArgWord = many1 orgArgWordChar
 
+-- | Parse code block arguments
+-- TODO: We currently don't handle switches.
 codeHeaderArgs :: OrgParser ([String], [(String, String)])
-codeHeaderArgs =
-  (\x -> (x, [])) <$> manyTill (many nonspaceChar <* skipSpaces) newline
+codeHeaderArgs = try $ do
+  language   <- skipSpaces *> orgArgWord
+  _          <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar))
+  parameters <- manyTill blockOption newline
+  let pandocLang = translateLang language
+  return $
+    if hasRundocParameters parameters
+    then ( [ pandocLang, rundocBlockClass ]
+         , map toRundocAttrib (("language", language) : parameters)
+         )
+    else ([ pandocLang ], parameters)
+ where hasRundocParameters = not . null
+
+switch :: OrgParser SwitchOption
+switch = try $ simpleSwitch <|> lineNumbersSwitch
+ where
+   simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter)
+   lineNumbersSwitch = (\ls -> ('l', Just ls)) <$>
+                       (string "-l \"" *> many1Till nonspaceChar (char '"'))
 
 translateLang :: String -> String
 translateLang "C"          = "c"
@@ -401,6 +417,32 @@ translateLang "sh"         = "bash"
 translateLang "sqlite"     = "sql"
 translateLang cs = cs
 
+-- | Prefix used for Rundoc classes and arguments.
+rundocPrefix :: String
+rundocPrefix = "rundoc-"
+
+-- | The class-name used to mark rundoc blocks.
+rundocBlockClass :: String
+rundocBlockClass = rundocPrefix ++ "block"
+
+blockOption :: OrgParser (String, String)
+blockOption = try $ (,) <$> orgArgKey <*> orgArgValue
+
+orgArgKey :: OrgParser String
+orgArgKey = try $
+  skipSpaces *> char ':'
+             *> many1 orgArgWordChar
+
+orgArgValue :: OrgParser String
+orgArgValue = try $
+  skipSpaces *> many1 orgArgWordChar <* skipSpaces
+
+orgArgWordChar :: OrgParser Char
+orgArgWordChar = alphaNum <|> oneOf "-_"
+
+toRundocAttrib :: (String, String) -> (String, String)
+toRundocAttrib = first ("rundoc-" ++)
+
 commaEscaped :: String -> String
 commaEscaped (',':cs@('*':_))     = cs
 commaEscaped (',':cs@('#':'+':_)) = cs
@@ -425,7 +467,7 @@ drawer = try $ do
 
 drawerStart :: OrgParser String
 drawerStart = try $
-  skipSpaces *> drawerName <* skipSpaces <* newline
+  skipSpaces *> drawerName <* skipSpaces <* P.newline
  where drawerName = try $  char ':' *> validDrawerName <* char ':'
        validDrawerName =  stringAnyCase "PROPERTIES"
                           <|> stringAnyCase "LOGBOOK"
@@ -435,7 +477,7 @@ drawerLine = try anyLine
 
 drawerEnd :: OrgParser String
 drawerEnd = try $
-  skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* newline
+  skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* P.newline
 
 
 --
@@ -446,7 +488,7 @@ drawerEnd = try $
 figure :: OrgParser (F Blocks)
 figure = try $ do
   (cap, nam) <- nameAndCaption
-  src <- skipSpaces *> selfTarget <* skipSpaces <* newline
+  src <- skipSpaces *> selfTarget <* skipSpaces <* P.newline
   guard (isImageFilename src)
   return $ do
     cap' <- cap
@@ -1036,34 +1078,6 @@ inlineCodeBlock = try $ do
   returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode
  where enclosedByPair s e p = char s *> many1Till p (char e)
 
--- | Prefix used for Rundoc classes and arguments.
-rundocPrefix :: String
-rundocPrefix = "rundoc-"
-
--- | The class-name used to mark rundoc blocks.
-rundocBlockClass :: String
-rundocBlockClass = rundocPrefix ++ "block"
-
-blockOption :: OrgParser (String, String)
-blockOption = try $ (,) <$> orgArgKey <*> orgArgValue
-
-orgArgKey :: OrgParser String
-orgArgKey = try $
-  skipSpaces *> char ':'
-             *> many1 orgArgWordChar
-             <* many1 spaceChar
-
-orgArgValue :: OrgParser String
-orgArgValue = try $
-  skipSpaces *> many1 orgArgWordChar
-             <* skipSpaces
-
-orgArgWordChar :: OrgParser Char
-orgArgWordChar = alphaNum <|> oneOf "-_"
-
-toRundocAttrib :: (String, String) -> (String, String)
-toRundocAttrib = first ("rundoc-" ++)
-
 emph      :: OrgParser (F Inlines)
 emph      = fmap B.emph         <$> emphasisBetween '/'
 
diff --git a/tests/Tests/Readers/Org.hs b/tests/Tests/Readers/Org.hs
index 949976aba..a78e8861f 100644
--- a/tests/Tests/Readers/Org.hs
+++ b/tests/Tests/Readers/Org.hs
@@ -822,6 +822,20 @@ tests =
            in mconcat [ para $ spcSep [ "Low", "German", "greeting"  ]
                       , codeBlockWith attr' code'
                       ]
+      , "Source block with rundoc/babel arguments" =:
+           unlines [ "#+BEGIN_SRC emacs-lisp :exports both"
+                   , "(progn (message \"Hello, World!\")"
+                   , "       (+ 23 42))"
+                   , "#+END_SRC" ] =?>
+           let classes = [ "commonlisp"  -- as kate doesn't know emacs-lisp syntax
+                         , "rundoc-block"
+                         ]
+               params = [ ("rundoc-language", "emacs-lisp")
+                        , ("rundoc-exports", "both")
+                        ]
+               code' = unlines [ "(progn (message \"Hello, World!\")"
+                               , "       (+ 23 42))" ]
+           in codeBlockWith ("", classes, params) code'
 
       , "Example block" =:
            unlines [ "#+begin_example"
-- 
cgit v1.2.3


From 07694b30184bcf2ed0e2998016df394f47a1996f Mon Sep 17 00:00:00 2001
From: Albert Krewinkel <tarleb@moltkeplatz.de>
Date: Fri, 9 May 2014 18:23:23 +0200
Subject: Org reader: Fix parsing of blank lines within blocks

Blank lines were parsed as two newlines instead of just one.
Thanks to Xiao Hanyu (@xiaohanyu) for pointing this out.
---
 src/Text/Pandoc/Readers/Org.hs | 8 ++------
 tests/Tests/Readers/Org.hs     | 9 +++++++++
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'src/Text')

diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index c05ac92d0..0f218d43f 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -357,12 +357,8 @@ rawBlockContent :: BlockProperties -> OrgParser String
 rawBlockContent (indent, blockType) = try $
   unlines . map commaEscaped <$> manyTill indentedLine blockEnder
  where
-   indentedLine = try $
-     choice [ blankline         *> pure "\n"
-            , indentWith indent *> anyLine
-            ]
-   blockEnder = try $
-     indentWith indent *> stringAnyCase ("#+end_" <> blockType)
+   indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine)
+   blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType)
 
 parsedBlockContent :: BlockProperties -> OrgParser (F Blocks)
 parsedBlockContent blkProps = try $ do
diff --git a/tests/Tests/Readers/Org.hs b/tests/Tests/Readers/Org.hs
index a78e8861f..87b0d0c90 100644
--- a/tests/Tests/Readers/Org.hs
+++ b/tests/Tests/Readers/Org.hs
@@ -920,5 +920,14 @@ tests =
                              (unlines [ "fmap id = id"
                                       , "fmap (p . q) = (fmap p) . (fmap q)"
                                       ])))
+
+      , "Convert blank lines in blocks to single newlines" =:
+          unlines [ "#+begin_html"
+                  , ""
+                  , "<span>boring</span>"
+                  , ""
+                  , "#+end_html"
+                  ] =?>
+          rawBlock "html" "\n<span>boring</span>\n\n"
       ]
   ]
-- 
cgit v1.2.3