diff options
author | David Lazar <lazar6@illinois.edu> | 2013-03-28 14:53:10 -0700 |
---|---|---|
committer | David Lazar <lazar6@illinois.edu> | 2013-03-28 14:53:10 -0700 |
commit | 18459b95bab91b2a1d59e888045d555494d94bc8 (patch) | |
tree | 1974f9b339f37f9bdb90e309388ef5a436f7bcfe /src | |
parent | ee0fc19bc54208c5d8828eab872e3bbe303c47bf (diff) | |
download | pandoc-18459b95bab91b2a1d59e888045d555494d94bc8.tar.gz |
Add reader for Haddock markup based on Haddock's own lexer/parser.
Diffstat (limited to 'src')
-rw-r--r-- | src/Text/Pandoc.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Haddock.hs | 39 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Haddock/Lex.x | 169 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Haddock/Parse.y | 179 |
4 files changed, 390 insertions, 0 deletions
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 89faf140c..cd2aa0fd3 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -73,6 +73,7 @@ module Text.Pandoc , readTextile , readDocBook , readOPML + , readHaddock , readNative -- * Writers: converting /from/ Pandoc format , Writer (..) @@ -120,6 +121,7 @@ import Text.Pandoc.Readers.LaTeX import Text.Pandoc.Readers.HTML import Text.Pandoc.Readers.Textile import Text.Pandoc.Readers.Native +import Text.Pandoc.Readers.Haddock import Text.Pandoc.Writers.Native import Text.Pandoc.Writers.Markdown import Text.Pandoc.Writers.RST @@ -200,6 +202,7 @@ readers = [("native" , \_ s -> return $ readNative s) ,("textile" , \o s -> return $ readTextile o s) -- TODO : textile+lhs ,("html" , \o s -> return $ readHtml o s) ,("latex" , \o s -> return $ readLaTeX o s) + ,("haddock" , \o s -> return $ readHaddock o s) ] data Writer = PureStringWriter (WriterOptions -> Pandoc -> String) diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs new file mode 100644 index 000000000..49154b0ca --- /dev/null +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -0,0 +1,39 @@ +{- | + Module : Text.Pandoc.Readers.Haddock + Copyright : Copyright (C) 2013 David Lazar + License : GNU GPL, version 2 or above + + Maintainer : David Lazar <lazar6@illinois.edu> + Stability : alpha + +Conversion of Haddock markup to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Haddock + ( readHaddock + ) where + +import Text.Pandoc.Builder +import Text.Pandoc.Options +import Text.Pandoc.Readers.Haddock.Lex +import Text.Pandoc.Readers.Haddock.Parse + +-- | Parse Haddock markup and return a 'Pandoc' document. +readHaddock :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse + -> Pandoc +readHaddock _ s = Pandoc (Meta [] [] []) blocks + where + blocks = case parseParas (tokenise s (0,0)) of + Nothing -> [] + Just x -> mergeLists (toList x) + +-- similar to 'docAppend' in Haddock.Doc +mergeLists :: [Block] -> [Block] +mergeLists (BulletList xs : BulletList ys : blocks) + = mergeLists (BulletList (xs ++ ys) : blocks) +mergeLists (OrderedList _ xs : OrderedList a ys : blocks) + = mergeLists (OrderedList a (xs ++ ys) : blocks) +mergeLists (DefinitionList xs : DefinitionList ys : blocks) + = mergeLists (DefinitionList (xs ++ ys) : blocks) +mergeLists (x : blocks) = x : mergeLists blocks +mergeLists [] = [] diff --git a/src/Text/Pandoc/Readers/Haddock/Lex.x b/src/Text/Pandoc/Readers/Haddock/Lex.x new file mode 100644 index 000000000..902ac84c0 --- /dev/null +++ b/src/Text/Pandoc/Readers/Haddock/Lex.x @@ -0,0 +1,169 @@ +-- +-- Haddock - A Haskell Documentation Tool +-- +-- (c) Simon Marlow 2002 +-- +-- This file was modified and integrated into GHC by David Waern 2006. +-- Then moved back into Haddock by Isaac Dupree in 2009 :-) +-- Then copied into Pandoc by David Lazar in 2013 :-D + +{ +{-# LANGUAGE BangPatterns #-} -- Generated by Alex +{-# OPTIONS -Wwarn -w #-} +-- The above warning supression flag is a temporary kludge. +-- While working on this module you are encouraged to remove it and fix +-- any warnings in the module. See +-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings +-- for details + +module Text.Pandoc.Readers.Haddock.Lex ( + Token(..), + LToken, + tokenise + ) where + +import Data.Char +import Numeric (readHex) +} + +%wrapper "posn" + +$ws = $white # \n +$digit = [0-9] +$hexdigit = [0-9a-fA-F] +$special = [\"\@] +$alphanum = [A-Za-z0-9] +$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:] + +:- + +-- beginning of a paragraph +<0,para> { + $ws* \n ; + $ws* \> { begin birdtrack } + $ws* prop \> .* \n { strtoken TokProperty `andBegin` property} + $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } + $ws* [\*\-] { token TokBullet `andBegin` string } + $ws* \[ { token TokDefStart `andBegin` def } + $ws* \( $digit+ \) { token TokNumber `andBegin` string } + $ws* $digit+ \. { token TokNumber `andBegin` string } + $ws* { begin string } +} + +-- beginning of a line +<line> { + $ws* \> { begin birdtrack } + $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } + $ws* \n { token TokPara `andBegin` para } + + -- Here, we really want to be able to say + -- $ws* (\n | <eof>) { token TokPara `andBegin` para} + -- because otherwise a trailing line of whitespace will result in + -- a spurious TokString at the end of a docstring. We don't have <eof>, + -- though (NOW I realise what it was for :-). To get around this, we always + -- append \n to the end of a docstring. + () { begin string } +} + +<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line } + +<property> () { token TokPara `andBegin` para } + +<example> { + $ws* \n { token TokPara `andBegin` para } + $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } + () { begin exampleresult } +} + +<exampleexpr> .* \n { strtokenNL TokExampleExpression `andBegin` example } + +<exampleresult> .* \n { strtokenNL TokExampleResult `andBegin` example } + +<string,def> { + $special { strtoken $ \s -> TokSpecial (head s) } + \<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) } + \< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) } + \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) } + \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) } + [\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) } + \\ . { strtoken (TokString . tail) } + "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] } + "&#" [xX] $hexdigit+ \; + { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] } + -- allow special characters through if they don't fit one of the previous + -- patterns. + [\/\'\`\<\#\&\\] { strtoken TokString } + [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line } + [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString } +} + +<def> { + \] { token TokDefEnd `andBegin` string } +} + +-- ']' doesn't have any special meaning outside of the [...] at the beginning +-- of a definition paragraph. +<string> { + \] { strtoken TokString } +} + +{ +-- | A located token +type LToken = (Token, AlexPosn) + +data Token + = TokPara + | TokNumber + | TokBullet + | TokDefStart + | TokDefEnd + | TokSpecial Char + | TokIdent String + | TokString String + | TokURL String + | TokPic String + | TokEmphasis String + | TokAName String + | TokBirdTrack String + | TokProperty String + | TokExamplePrompt String + | TokExampleExpression String + | TokExampleResult String +-- deriving Show + +tokenPos :: LToken -> (Int, Int) +tokenPos t = let AlexPn _ line col = snd t in (line, col) + +type StartCode = Int +type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken] + +tokenise :: String -> (Int, Int) -> [LToken] +tokenise str (line, col) = go (posn,'\n',[],eofHack str) para + where posn = AlexPn 0 line col + go inp@(pos,_,_,str) sc = + case alexScan inp sc of + AlexEOF -> [] + AlexError _ -> [] + AlexSkip inp' len -> go inp' sc + AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) + +-- NB. we add a final \n to the string, (see comment in the beginning of line +-- production above). +eofHack str = str++"\n" + +andBegin :: Action -> StartCode -> Action +andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont + +token :: Token -> Action +token t = \pos _ sc cont -> (t, pos) : cont sc + +strtoken, strtokenNL :: (String -> Token) -> Action +strtoken t = \pos str sc cont -> (t str, pos) : cont sc +strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc +-- ^ We only want LF line endings in our internal doc string format, so we +-- filter out all CRs. + +begin :: StartCode -> Action +begin sc = \_ _ _ cont -> cont sc + +} diff --git a/src/Text/Pandoc/Readers/Haddock/Parse.y b/src/Text/Pandoc/Readers/Haddock/Parse.y new file mode 100644 index 000000000..065b9997f --- /dev/null +++ b/src/Text/Pandoc/Readers/Haddock/Parse.y @@ -0,0 +1,179 @@ +-- This code was copied from the 'haddock' package, modified, and integrated +-- into Pandoc by David Lazar. +{ +{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6 +{-# OPTIONS -Wwarn -w #-} +-- The above warning supression flag is a temporary kludge. +-- While working on this module you are encouraged to remove it and fix +-- any warnings in the module. See +-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings +-- for details + +module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where + +import Text.Pandoc.Readers.Haddock.Lex +import Text.Pandoc.Builder +import Data.Generics (everywhere, mkT) +import Data.Char (isSpace) +import Data.Maybe (fromMaybe) +import Data.List (stripPrefix) +import Data.Monoid (mempty) +} + +%expect 0 + +%tokentype { LToken } + +%token + '/' { (TokSpecial '/',_) } + '@' { (TokSpecial '@',_) } + '[' { (TokDefStart,_) } + ']' { (TokDefEnd,_) } + DQUO { (TokSpecial '\"',_) } + URL { (TokURL $$,_) } + PIC { (TokPic $$,_) } + ANAME { (TokAName $$,_) } + '/../' { (TokEmphasis $$,_) } + '-' { (TokBullet,_) } + '(n)' { (TokNumber,_) } + '>..' { (TokBirdTrack $$,_) } + PROP { (TokProperty $$,_) } + PROMPT { (TokExamplePrompt $$,_) } + RESULT { (TokExampleResult $$,_) } + EXP { (TokExampleExpression $$,_) } + IDENT { (TokIdent $$,_) } + PARA { (TokPara,_) } + STRING { (TokString $$,_) } + +%monad { Maybe } + +%name parseParas doc +%name parseString seq + +%% + +doc :: { Blocks } + : apara PARA doc { $1 <> $3 } + | PARA doc { $2 } + | apara { $1 } + | {- empty -} { mempty } + +apara :: { Blocks } + : ulpara { bulletList [$1] } + | olpara { orderedList [$1] } + | defpara { definitionList [$1] } + | para { $1 } + +ulpara :: { Blocks } + : '-' para { $2 } + +olpara :: { Blocks } + : '(n)' para { $2 } + +defpara :: { (Inlines, [Blocks]) } + : '[' seq ']' seq { ($2, [plain $4]) } + +para :: { Blocks } + : seq { para $1 } + | codepara { codeBlock $1 } + | property { $1 } + | examples { $1 } + +codepara :: { String } + : '>..' codepara { $1 ++ $2 } + | '>..' { $1 } + +property :: { Blocks } + : PROP { makeProperty $1 } + +examples :: { Blocks } + : example examples { $1 <> $2 } + | example { $1 } + +example :: { Blocks } + : PROMPT EXP result { makeExample $1 $2 (lines $3) } + | PROMPT EXP { makeExample $1 $2 [] } + +result :: { String } + : RESULT result { $1 ++ $2 } + | RESULT { $1 } + +seq :: { Inlines } + : elem seq { $1 <> $2 } + | elem { $1 } + +elem :: { Inlines } + : elem1 { $1 } + | '@' seq1 '@' { monospace $2 } + +seq1 :: { Inlines } + : PARA seq1 { linebreak <> $2 } + | elem1 seq1 { $1 <> $2 } + | elem1 { $1 } + +elem1 :: { Inlines } + : STRING { str $1 } + | '/../' { emph (str $1) } + | URL { makeHyperlink $1 } + | PIC { image $1 $1 mempty } + | ANAME { mempty } -- TODO + | IDENT { code $1 } + | DQUO strings DQUO { code $2 } + +strings :: { String } + : STRING { $1 } + | STRING strings { $1 ++ $2 } + +{ +happyError :: [LToken] -> Maybe a +happyError toks = Nothing + +monospace :: Inlines -> Inlines +monospace = everywhere (mkT go) + where + go (Str s) = Code nullAttr s + go Space = Code nullAttr " " + go x = x + +-- | Create a `Hyperlink` from given string. +-- +-- A hyperlink consists of a URL and an optional label. The label is separated +-- from the url by one or more whitespace characters. +makeHyperlink :: String -> Inlines +makeHyperlink input = case break isSpace $ strip input of + (url, "") -> link url url (str url) + (url, lb) -> link url url (str label) + where label = dropWhile isSpace lb + +makeProperty :: String -> Blocks +makeProperty s = case strip s of + 'p':'r':'o':'p':'>':xs -> + codeBlockWith ([], ["property"], []) (dropWhile isSpace xs) + xs -> + error $ "makeProperty: invalid input " ++ show xs + +-- | Create an 'Example', stripping superfluous characters as appropriate +makeExample :: String -> String -> [String] -> Blocks +makeExample prompt expression result = + para $ codeWith ([], ["expr"], []) (strip expression ++ "\n") + <> codeWith ([], ["result"], []) (unlines result') + where + -- 1. drop trailing whitespace from the prompt, remember the prefix + (prefix, _) = span isSpace prompt + + -- 2. drop, if possible, the exact same sequence of whitespace + -- characters from each result line + -- + -- 3. interpret lines that only contain the string "<BLANKLINE>" as an + -- empty line + result' = map (substituteBlankLine . tryStripPrefix prefix) result + where + tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys + + substituteBlankLine "<BLANKLINE>" = "" + substituteBlankLine line = line + +-- | Remove all leading and trailing whitespace +strip :: String -> String +strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse +} |