aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDavid Lazar <lazar6@illinois.edu>2013-03-28 14:53:10 -0700
committerDavid Lazar <lazar6@illinois.edu>2013-03-28 14:53:10 -0700
commit18459b95bab91b2a1d59e888045d555494d94bc8 (patch)
tree1974f9b339f37f9bdb90e309388ef5a436f7bcfe /src
parentee0fc19bc54208c5d8828eab872e3bbe303c47bf (diff)
downloadpandoc-18459b95bab91b2a1d59e888045d555494d94bc8.tar.gz
Add reader for Haddock markup based on Haddock's own lexer/parser.
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc.hs3
-rw-r--r--src/Text/Pandoc/Readers/Haddock.hs39
-rw-r--r--src/Text/Pandoc/Readers/Haddock/Lex.x169
-rw-r--r--src/Text/Pandoc/Readers/Haddock/Parse.y179
4 files changed, 390 insertions, 0 deletions
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index 89faf140c..cd2aa0fd3 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -73,6 +73,7 @@ module Text.Pandoc
, readTextile
, readDocBook
, readOPML
+ , readHaddock
, readNative
-- * Writers: converting /from/ Pandoc format
, Writer (..)
@@ -120,6 +121,7 @@ import Text.Pandoc.Readers.LaTeX
import Text.Pandoc.Readers.HTML
import Text.Pandoc.Readers.Textile
import Text.Pandoc.Readers.Native
+import Text.Pandoc.Readers.Haddock
import Text.Pandoc.Writers.Native
import Text.Pandoc.Writers.Markdown
import Text.Pandoc.Writers.RST
@@ -200,6 +202,7 @@ readers = [("native" , \_ s -> return $ readNative s)
,("textile" , \o s -> return $ readTextile o s) -- TODO : textile+lhs
,("html" , \o s -> return $ readHtml o s)
,("latex" , \o s -> return $ readLaTeX o s)
+ ,("haddock" , \o s -> return $ readHaddock o s)
]
data Writer = PureStringWriter (WriterOptions -> Pandoc -> String)
diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs
new file mode 100644
index 000000000..49154b0ca
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Haddock.hs
@@ -0,0 +1,39 @@
+{- |
+ Module : Text.Pandoc.Readers.Haddock
+ Copyright : Copyright (C) 2013 David Lazar
+ License : GNU GPL, version 2 or above
+
+ Maintainer : David Lazar <lazar6@illinois.edu>
+ Stability : alpha
+
+Conversion of Haddock markup to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Haddock
+ ( readHaddock
+ ) where
+
+import Text.Pandoc.Builder
+import Text.Pandoc.Options
+import Text.Pandoc.Readers.Haddock.Lex
+import Text.Pandoc.Readers.Haddock.Parse
+
+-- | Parse Haddock markup and return a 'Pandoc' document.
+readHaddock :: ReaderOptions -- ^ Reader options
+ -> String -- ^ String to parse
+ -> Pandoc
+readHaddock _ s = Pandoc (Meta [] [] []) blocks
+ where
+ blocks = case parseParas (tokenise s (0,0)) of
+ Nothing -> []
+ Just x -> mergeLists (toList x)
+
+-- similar to 'docAppend' in Haddock.Doc
+mergeLists :: [Block] -> [Block]
+mergeLists (BulletList xs : BulletList ys : blocks)
+ = mergeLists (BulletList (xs ++ ys) : blocks)
+mergeLists (OrderedList _ xs : OrderedList a ys : blocks)
+ = mergeLists (OrderedList a (xs ++ ys) : blocks)
+mergeLists (DefinitionList xs : DefinitionList ys : blocks)
+ = mergeLists (DefinitionList (xs ++ ys) : blocks)
+mergeLists (x : blocks) = x : mergeLists blocks
+mergeLists [] = []
diff --git a/src/Text/Pandoc/Readers/Haddock/Lex.x b/src/Text/Pandoc/Readers/Haddock/Lex.x
new file mode 100644
index 000000000..902ac84c0
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Haddock/Lex.x
@@ -0,0 +1,169 @@
+--
+-- Haddock - A Haskell Documentation Tool
+--
+-- (c) Simon Marlow 2002
+--
+-- This file was modified and integrated into GHC by David Waern 2006.
+-- Then moved back into Haddock by Isaac Dupree in 2009 :-)
+-- Then copied into Pandoc by David Lazar in 2013 :-D
+
+{
+{-# LANGUAGE BangPatterns #-} -- Generated by Alex
+{-# OPTIONS -Wwarn -w #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and fix
+-- any warnings in the module. See
+-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
+-- for details
+
+module Text.Pandoc.Readers.Haddock.Lex (
+ Token(..),
+ LToken,
+ tokenise
+ ) where
+
+import Data.Char
+import Numeric (readHex)
+}
+
+%wrapper "posn"
+
+$ws = $white # \n
+$digit = [0-9]
+$hexdigit = [0-9a-fA-F]
+$special = [\"\@]
+$alphanum = [A-Za-z0-9]
+$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:]
+
+:-
+
+-- beginning of a paragraph
+<0,para> {
+ $ws* \n ;
+ $ws* \> { begin birdtrack }
+ $ws* prop \> .* \n { strtoken TokProperty `andBegin` property}
+ $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
+ $ws* [\*\-] { token TokBullet `andBegin` string }
+ $ws* \[ { token TokDefStart `andBegin` def }
+ $ws* \( $digit+ \) { token TokNumber `andBegin` string }
+ $ws* $digit+ \. { token TokNumber `andBegin` string }
+ $ws* { begin string }
+}
+
+-- beginning of a line
+<line> {
+ $ws* \> { begin birdtrack }
+ $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
+ $ws* \n { token TokPara `andBegin` para }
+
+ -- Here, we really want to be able to say
+ -- $ws* (\n | <eof>) { token TokPara `andBegin` para}
+ -- because otherwise a trailing line of whitespace will result in
+ -- a spurious TokString at the end of a docstring. We don't have <eof>,
+ -- though (NOW I realise what it was for :-). To get around this, we always
+ -- append \n to the end of a docstring.
+ () { begin string }
+}
+
+<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line }
+
+<property> () { token TokPara `andBegin` para }
+
+<example> {
+ $ws* \n { token TokPara `andBegin` para }
+ $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
+ () { begin exampleresult }
+}
+
+<exampleexpr> .* \n { strtokenNL TokExampleExpression `andBegin` example }
+
+<exampleresult> .* \n { strtokenNL TokExampleResult `andBegin` example }
+
+<string,def> {
+ $special { strtoken $ \s -> TokSpecial (head s) }
+ \<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
+ \< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) }
+ \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) }
+ \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) }
+ [\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) }
+ \\ . { strtoken (TokString . tail) }
+ "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
+ "&#" [xX] $hexdigit+ \;
+ { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
+ -- allow special characters through if they don't fit one of the previous
+ -- patterns.
+ [\/\'\`\<\#\&\\] { strtoken TokString }
+ [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
+ [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString }
+}
+
+<def> {
+ \] { token TokDefEnd `andBegin` string }
+}
+
+-- ']' doesn't have any special meaning outside of the [...] at the beginning
+-- of a definition paragraph.
+<string> {
+ \] { strtoken TokString }
+}
+
+{
+-- | A located token
+type LToken = (Token, AlexPosn)
+
+data Token
+ = TokPara
+ | TokNumber
+ | TokBullet
+ | TokDefStart
+ | TokDefEnd
+ | TokSpecial Char
+ | TokIdent String
+ | TokString String
+ | TokURL String
+ | TokPic String
+ | TokEmphasis String
+ | TokAName String
+ | TokBirdTrack String
+ | TokProperty String
+ | TokExamplePrompt String
+ | TokExampleExpression String
+ | TokExampleResult String
+-- deriving Show
+
+tokenPos :: LToken -> (Int, Int)
+tokenPos t = let AlexPn _ line col = snd t in (line, col)
+
+type StartCode = Int
+type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken]
+
+tokenise :: String -> (Int, Int) -> [LToken]
+tokenise str (line, col) = go (posn,'\n',[],eofHack str) para
+ where posn = AlexPn 0 line col
+ go inp@(pos,_,_,str) sc =
+ case alexScan inp sc of
+ AlexEOF -> []
+ AlexError _ -> []
+ AlexSkip inp' len -> go inp' sc
+ AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc)
+
+-- NB. we add a final \n to the string, (see comment in the beginning of line
+-- production above).
+eofHack str = str++"\n"
+
+andBegin :: Action -> StartCode -> Action
+andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont
+
+token :: Token -> Action
+token t = \pos _ sc cont -> (t, pos) : cont sc
+
+strtoken, strtokenNL :: (String -> Token) -> Action
+strtoken t = \pos str sc cont -> (t str, pos) : cont sc
+strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc
+-- ^ We only want LF line endings in our internal doc string format, so we
+-- filter out all CRs.
+
+begin :: StartCode -> Action
+begin sc = \_ _ _ cont -> cont sc
+
+}
diff --git a/src/Text/Pandoc/Readers/Haddock/Parse.y b/src/Text/Pandoc/Readers/Haddock/Parse.y
new file mode 100644
index 000000000..065b9997f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Haddock/Parse.y
@@ -0,0 +1,179 @@
+-- This code was copied from the 'haddock' package, modified, and integrated
+-- into Pandoc by David Lazar.
+{
+{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6
+{-# OPTIONS -Wwarn -w #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and fix
+-- any warnings in the module. See
+-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
+-- for details
+
+module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where
+
+import Text.Pandoc.Readers.Haddock.Lex
+import Text.Pandoc.Builder
+import Data.Generics (everywhere, mkT)
+import Data.Char (isSpace)
+import Data.Maybe (fromMaybe)
+import Data.List (stripPrefix)
+import Data.Monoid (mempty)
+}
+
+%expect 0
+
+%tokentype { LToken }
+
+%token
+ '/' { (TokSpecial '/',_) }
+ '@' { (TokSpecial '@',_) }
+ '[' { (TokDefStart,_) }
+ ']' { (TokDefEnd,_) }
+ DQUO { (TokSpecial '\"',_) }
+ URL { (TokURL $$,_) }
+ PIC { (TokPic $$,_) }
+ ANAME { (TokAName $$,_) }
+ '/../' { (TokEmphasis $$,_) }
+ '-' { (TokBullet,_) }
+ '(n)' { (TokNumber,_) }
+ '>..' { (TokBirdTrack $$,_) }
+ PROP { (TokProperty $$,_) }
+ PROMPT { (TokExamplePrompt $$,_) }
+ RESULT { (TokExampleResult $$,_) }
+ EXP { (TokExampleExpression $$,_) }
+ IDENT { (TokIdent $$,_) }
+ PARA { (TokPara,_) }
+ STRING { (TokString $$,_) }
+
+%monad { Maybe }
+
+%name parseParas doc
+%name parseString seq
+
+%%
+
+doc :: { Blocks }
+ : apara PARA doc { $1 <> $3 }
+ | PARA doc { $2 }
+ | apara { $1 }
+ | {- empty -} { mempty }
+
+apara :: { Blocks }
+ : ulpara { bulletList [$1] }
+ | olpara { orderedList [$1] }
+ | defpara { definitionList [$1] }
+ | para { $1 }
+
+ulpara :: { Blocks }
+ : '-' para { $2 }
+
+olpara :: { Blocks }
+ : '(n)' para { $2 }
+
+defpara :: { (Inlines, [Blocks]) }
+ : '[' seq ']' seq { ($2, [plain $4]) }
+
+para :: { Blocks }
+ : seq { para $1 }
+ | codepara { codeBlock $1 }
+ | property { $1 }
+ | examples { $1 }
+
+codepara :: { String }
+ : '>..' codepara { $1 ++ $2 }
+ | '>..' { $1 }
+
+property :: { Blocks }
+ : PROP { makeProperty $1 }
+
+examples :: { Blocks }
+ : example examples { $1 <> $2 }
+ | example { $1 }
+
+example :: { Blocks }
+ : PROMPT EXP result { makeExample $1 $2 (lines $3) }
+ | PROMPT EXP { makeExample $1 $2 [] }
+
+result :: { String }
+ : RESULT result { $1 ++ $2 }
+ | RESULT { $1 }
+
+seq :: { Inlines }
+ : elem seq { $1 <> $2 }
+ | elem { $1 }
+
+elem :: { Inlines }
+ : elem1 { $1 }
+ | '@' seq1 '@' { monospace $2 }
+
+seq1 :: { Inlines }
+ : PARA seq1 { linebreak <> $2 }
+ | elem1 seq1 { $1 <> $2 }
+ | elem1 { $1 }
+
+elem1 :: { Inlines }
+ : STRING { str $1 }
+ | '/../' { emph (str $1) }
+ | URL { makeHyperlink $1 }
+ | PIC { image $1 $1 mempty }
+ | ANAME { mempty } -- TODO
+ | IDENT { code $1 }
+ | DQUO strings DQUO { code $2 }
+
+strings :: { String }
+ : STRING { $1 }
+ | STRING strings { $1 ++ $2 }
+
+{
+happyError :: [LToken] -> Maybe a
+happyError toks = Nothing
+
+monospace :: Inlines -> Inlines
+monospace = everywhere (mkT go)
+ where
+ go (Str s) = Code nullAttr s
+ go Space = Code nullAttr " "
+ go x = x
+
+-- | Create a `Hyperlink` from given string.
+--
+-- A hyperlink consists of a URL and an optional label. The label is separated
+-- from the url by one or more whitespace characters.
+makeHyperlink :: String -> Inlines
+makeHyperlink input = case break isSpace $ strip input of
+ (url, "") -> link url url (str url)
+ (url, lb) -> link url url (str label)
+ where label = dropWhile isSpace lb
+
+makeProperty :: String -> Blocks
+makeProperty s = case strip s of
+ 'p':'r':'o':'p':'>':xs ->
+ codeBlockWith ([], ["property"], []) (dropWhile isSpace xs)
+ xs ->
+ error $ "makeProperty: invalid input " ++ show xs
+
+-- | Create an 'Example', stripping superfluous characters as appropriate
+makeExample :: String -> String -> [String] -> Blocks
+makeExample prompt expression result =
+ para $ codeWith ([], ["expr"], []) (strip expression ++ "\n")
+ <> codeWith ([], ["result"], []) (unlines result')
+ where
+ -- 1. drop trailing whitespace from the prompt, remember the prefix
+ (prefix, _) = span isSpace prompt
+
+ -- 2. drop, if possible, the exact same sequence of whitespace
+ -- characters from each result line
+ --
+ -- 3. interpret lines that only contain the string "<BLANKLINE>" as an
+ -- empty line
+ result' = map (substituteBlankLine . tryStripPrefix prefix) result
+ where
+ tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
+
+ substituteBlankLine "<BLANKLINE>" = ""
+ substituteBlankLine line = line
+
+-- | Remove all leading and trailing whitespace
+strip :: String -> String
+strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse
+}