5 files changed, 207 insertions, 42 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 99e6f99e6..48a512be2 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -73,6 +73,7 @@ import Text.TeXMath (Exp)
 import Text.TeXMath.Readers.OMML (readOMML)
 import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, stringToFont)
 import Text.XML.Light
+import qualified Text.XML.Light.Cursor as XMLC
 
 data ReaderEnv = ReaderEnv { envNotes         :: Notes
                            , envComments      :: Comments
@@ -117,6 +118,32 @@ mapD f xs =
   in
    concatMapM handler xs
 
+unwrapSDT :: NameSpaces -> Content -> Content
+unwrapSDT ns (Elem element)
+  | isElem ns "w" "sdt" element
+  , Just sdtContent <- findChildByName ns "w" "sdtContent" element
+  , child : _    <- elChildren sdtContent
+  = Elem child
+unwrapSDT _ content = content
+
+walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor
+walkDocument' ns cur =
+  let modifiedCur = XMLC.modifyContent (unwrapSDT ns) cur
+  in
+    case XMLC.nextDF modifiedCur of
+      Just cur' -> walkDocument' ns cur'
+      Nothing   -> XMLC.root modifiedCur
+
+walkDocument :: NameSpaces -> Element -> Maybe Element
+walkDocument ns element =
+  let cur = XMLC.fromContent (Elem element)
+      cur' = walkDocument' ns cur
+  in
+    case XMLC.toTree cur' of
+      Elem element' -> Just element'
+      _             -> Nothing
+
+
 data Docx = Docx Document
           deriving Show
 
@@ -298,7 +325,10 @@ archiveToDocument zf = do
   docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
   let namespaces = elemToNameSpaces docElem
   bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
-  body <- elemToBody namespaces bodyElem
+  let bodyElem' = case walkDocument namespaces bodyElem of
+        Just e -> e
+        Nothing -> bodyElem
+  body <- elemToBody namespaces bodyElem'
   return $ Document namespaces body
 
 elemToBody :: NameSpaces -> Element -> D Body
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 3e59c4bf7..05a80335a 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -51,7 +51,7 @@ import Data.Char (isAlphaNum, isDigit, isLetter)
 import Data.Default (Default (..), def)
 import Data.Foldable (for_)
 import Data.List (intercalate, isPrefixOf)
-import Data.List.Split (wordsBy)
+import Data.List.Split (wordsBy, splitWhen)
 import qualified Data.Map as M
 import Data.Maybe (fromMaybe, isJust, isNothing)
 import Data.Monoid (First (..), (<>))
@@ -66,6 +66,7 @@ import qualified Text.Pandoc.Builder as B
 import Text.Pandoc.Class (PandocMonad (..))
 import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
 import Text.Pandoc.Definition
+import Text.Pandoc.Extensions (Extension(..))
 import Text.Pandoc.Error
 import Text.Pandoc.Logging
 import Text.Pandoc.Options (
@@ -191,6 +192,7 @@ block = do
             , pHtml
             , pHead
             , pBody
+            , pLineBlock
             , pDiv
             , pPlain
             , pFigure
@@ -377,6 +379,16 @@ pRawTag = do
      then return mempty
      else return $ renderTags' [tag]
 
+pLineBlock :: PandocMonad m => TagParser m Blocks
+pLineBlock = try $ do
+  guardEnabled Ext_line_blocks
+  _ <- pSatisfy $ tagOpen (=="div") (== [("class","line-block")])
+  ils <- trimInlines . mconcat <$> manyTill inline (pSatisfy (tagClose (=="div")))
+  let lns = map B.fromList $
+            splitWhen (== LineBreak) $ filter (/= SoftBreak) $
+            B.toList ils
+  return $ B.lineBlock lns
+
 pDiv :: PandocMonad m => TagParser m Blocks
 pDiv = try $ do
   guardEnabled Ext_native_divs
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index 851fbec35..9223db68c 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -5,6 +5,7 @@ import Data.Char (isDigit, isSpace, toUpper)
 import Data.Default
 import Data.Generics
 import Data.List (intersperse)
+import qualified Data.Map as Map
 import Data.Maybe (maybeToList, fromMaybe)
 import Data.Text (Text)
 import qualified Data.Text as T
@@ -23,7 +24,6 @@ type JATS m = StateT JATSState m
 data JATSState = JATSState{ jatsSectionLevel :: Int
                           , jatsQuoteType    :: QuoteType
                           , jatsMeta         :: Meta
-                          , jatsAcceptsMeta  :: Bool
                           , jatsBook         :: Bool
                           , jatsFigureTitle  :: Inlines
                           , jatsContent      :: [Content]
@@ -33,7 +33,6 @@ instance Default JATSState where
   def = JATSState{ jatsSectionLevel = 0
                  , jatsQuoteType = DoubleQuote
                  , jatsMeta = mempty
-                 , jatsAcceptsMeta = False
                  , jatsBook = False
                  , jatsFigureTitle = mempty
                  , jatsContent = [] }
@@ -79,19 +78,6 @@ named s e = qName (elName e) == s
 
 --
 
-acceptingMetadata :: PandocMonad m => JATS m a -> JATS m a
-acceptingMetadata p = do
-  modify (\s -> s { jatsAcceptsMeta = True } )
-  res <- p
-  modify (\s -> s { jatsAcceptsMeta = False })
-  return res
-
-checkInMeta :: (PandocMonad m, Monoid a) => JATS m () -> JATS m a
-checkInMeta p = do
-  accepts <- jatsAcceptsMeta <$> get
-  when accepts p
-  return mempty
-
 addMeta :: PandocMonad m => ToMetaValue a => String -> a -> JATS m ()
 addMeta field val = modify (setMeta field val)
 
@@ -179,18 +165,16 @@ parseBlock (Elem e) =
                         <$> listitems
         "def-list" -> definitionList <$> deflistitems
         "sec" -> gets jatsSectionLevel >>= sect . (+1)
-        "title" -> return mempty
-        "title-group" -> checkInMeta getTitle
         "graphic" -> para <$> getGraphic e
-        "journal-meta" -> metaBlock
-        "article-meta" -> metaBlock
-        "custom-meta" -> metaBlock
+        "journal-meta" -> parseMetadata e
+        "article-meta" -> parseMetadata e
+        "custom-meta" -> parseMetadata e
+        "title" -> return mempty -- processed by header
         "table" -> parseTable
         "fig" -> divWith (attrValue "id" e, ["fig"], []) <$> getBlocks e
         "table-wrap" -> divWith (attrValue "id" e, ["table-wrap"], []) <$> getBlocks e
         "caption" -> divWith (attrValue "id" e, ["caption"], []) <$> sect 6
-        "ref-list" -> divWith ("refs", [], []) <$> getBlocks e
-        "ref" -> divWith ("ref-" <> attrValue "id" e, [], []) <$> getBlocks e
+        "ref-list" -> parseRefList e
         "?xml"  -> return mempty
         _       -> getBlocks e
    where parseMixed container conts = do
@@ -231,16 +215,6 @@ parseBlock (Elem e) =
                      terms' <- mapM getInlines terms
                      items' <- mapM getBlocks items
                      return (mconcat $ intersperse (str "; ") terms', items')
-         getTitle =  do
-                     tit <-  case filterChild (named "article-title") e of
-                                  Just s  -> getInlines s
-                                  Nothing -> return mempty
-                     subtit <-  case filterChild (named "subtitle") e of
-                                  Just s  -> (text ": " <>) <$>
-                                              getInlines s
-                                  Nothing -> return mempty
-                     addMeta "title" (tit <> subtit)
-
          parseTable = do
                       let isCaption x = named "title" x || named "caption" x
                       caption <- case filterChild isCaption e of
@@ -305,13 +279,127 @@ parseBlock (Elem e) =
                      let ident = attrValue "id" e
                      modify $ \st -> st{ jatsSectionLevel = oldN }
                      return $ headerWith (ident,[],[]) n' headerText <> b
---         lineItems = mapM getInlines $ filterChildren (named "line") e
-         metaBlock = acceptingMetadata (getBlocks e) >> return mempty
 
 getInlines :: PandocMonad m => Element -> JATS m Inlines
 getInlines e' = (trimInlines . mconcat) <$>
                  mapM parseInline (elContent e')
 
+parseMetadata :: PandocMonad m => Element -> JATS m Blocks
+parseMetadata e = do
+  getTitle e
+  getAuthors e
+  getAffiliations e
+  return mempty
+
+getTitle :: PandocMonad m => Element -> JATS m ()
+getTitle e = do
+  tit <-  case filterElement (named "article-title") e of
+               Just s  -> getInlines s
+               Nothing -> return mempty
+  subtit <-  case filterElement (named "subtitle") e of
+               Just s  -> (text ": " <>) <$>
+                           getInlines s
+               Nothing -> return mempty
+  when (tit /= mempty) $ addMeta "title" tit
+  when (subtit /= mempty) $ addMeta "subtitle" subtit
+
+getAuthors :: PandocMonad m => Element -> JATS m ()
+getAuthors e = do
+  authors <- mapM getContrib $ filterElements
+              (\x -> named "contrib" x &&
+                     attrValue "contrib-type" x == "author") e
+  authorNotes <- mapM getInlines $ filterElements (named "author-notes") e
+  let authors' = case (reverse authors, authorNotes) of
+                   ([], _)    -> []
+                   (_, [])    -> authors
+                   (a:as, ns) -> reverse as ++ [a <> mconcat ns]
+  unless (null authors) $ addMeta "author" authors'
+
+getAffiliations :: PandocMonad m => Element -> JATS m ()
+getAffiliations x = do
+  affs <- mapM getInlines $ filterChildren (named "aff") x
+  unless (null affs) $ addMeta "institute" affs
+
+getContrib :: PandocMonad m => Element -> JATS m Inlines
+getContrib x = do
+  given <- maybe (return mempty) getInlines
+            $ filterElement (named "given-names") x
+  family <- maybe (return mempty) getInlines
+            $ filterElement (named "surname") x
+  if given == mempty && family == mempty
+     then return mempty
+     else if given == mempty || family == mempty
+          then return $ given <> family
+          else return $ given <> space <> family
+
+parseRefList :: PandocMonad m => Element -> JATS m Blocks
+parseRefList e = do
+  refs <- mapM parseRef $ filterChildren (named "ref") e
+  addMeta "references" refs
+  return mempty
+
+parseRef :: PandocMonad m
+         => Element -> JATS m (Map.Map String MetaValue)
+parseRef e = do
+  let refId = text $ attrValue "id" e
+  let getInlineText n = maybe (return mempty) getInlines . filterChild (named n)
+  case filterChild (named "element-citation") e of
+       Just c  -> do
+         let refType = text $
+               case attrValue "publication-type" c of
+                  "journal" -> "article-journal"
+                  x -> x
+         (refTitle, refContainerTitle) <- do
+           t <- getInlineText "article-title" c
+           ct <- getInlineText "source" c
+           if t == mempty
+              then return (ct, mempty)
+              else return (t, ct)
+         refLabel <- getInlineText "label" c
+         refYear <- getInlineText "year" c
+         refVolume <- getInlineText "volume" c
+         refFirstPage <- getInlineText "fpage" c
+         refLastPage <- getInlineText "lpage" c
+         refPublisher <- getInlineText "publisher-name" c
+         refPublisherPlace <- getInlineText "publisher-loc" c
+         let refPages = refFirstPage <> (if refLastPage == mempty
+                                            then mempty
+                                            else text "\x2013" <> refLastPage)
+         let personGroups' = filterChildren (named "person-group") c
+         let getName nm = do
+               given <- maybe (return mempty) getInlines
+                         $ filterChild (named "given-names") nm
+               family <- maybe (return mempty) getInlines
+                         $ filterChild (named "surname") nm
+               return $ toMetaValue $ Map.fromList [
+                   ("given", given)
+                 , ("family", family)
+                 ]
+         personGroups <- mapM (\pg ->
+                                do names <- mapM getName
+                                            (filterChildren (named "name") pg)
+                                   return (attrValue "person-group-type" pg,
+                                           toMetaValue names))
+                         personGroups'
+         return $ Map.fromList $
+           [ ("id", toMetaValue refId)
+           , ("type", toMetaValue refType)
+           , ("title", toMetaValue refTitle)
+           , ("container-title", toMetaValue refContainerTitle)
+           , ("publisher", toMetaValue refPublisher)
+           , ("publisher-place", toMetaValue refPublisherPlace)
+           , ("title", toMetaValue refTitle)
+           , ("issued", toMetaValue
+                        $ Map.fromList [
+                            ("year", refYear)
+                          ])
+           , ("volume", toMetaValue refVolume)
+           , ("page", toMetaValue refPages)
+           , ("citation-label", toMetaValue refLabel)
+           ] ++ personGroups
+       Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty
+       -- TODO handle mixed-citation
+
 strContentRecursive :: Element -> String
 strContentRecursive = strContent .
   (\e' -> e'{ elContent = map elementToStr $ elContent e' })
@@ -354,7 +442,15 @@ parseInline (Elem e) =
             let rid = attrValue "rid" e
             let refType = ("ref-type",) <$> maybeAttrValue "ref-type" e
             let attr = (attrValue "id" e, [], maybeToList refType)
-            return $ linkWith attr ('#' : rid) "" ils
+            return $ if refType == Just ("ref-type","bibr")
+                        then cite [Citation{
+                                         citationId = rid
+                                       , citationPrefix = []
+                                       , citationSuffix = []
+                                       , citationMode = NormalCitation
+                                       , citationNoteNum = 0
+                                       , citationHash = 0}] ils
+                        else linkWith attr ('#' : rid) "" ils
         "ext-link" -> do
              ils <- innerInlines
              let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e
@@ -375,9 +471,6 @@ parseInline (Elem e) =
         "uri" -> return $ link (strContent e) "" $ str $ strContent e
         "fn" -> (note . mconcat) <$>
                          mapM parseBlock (elContent e)
-        -- Note: this isn't a real docbook tag; it's what we convert
-        -- <?asciidor-br?> to in handleInstructions, above.  A kludge to
-        -- work around xml-light's inability to parse an instruction.
         _          -> innerInlines
    where innerInlines = (trimInlines . mconcat) <$>
                           mapM parseInline (elContent e)
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index f7e45e01a..6c5567ffd 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -1489,8 +1489,17 @@ inlineCommands = M.union inlineLanguageCommands $ M.fromList $
   -- biblatex misc
   , ("RN", romanNumeralUpper)
   , ("Rn", romanNumeralLower)
+  -- babel
+  , ("foreignlanguage", foreignlanguage)
   ]
 
+foreignlanguage :: PandocMonad m => LP m Inlines
+foreignlanguage = do
+  babelLang <- T.unpack . untokenize <$> braced
+  case babelLangToBCP47 babelLang of
+       Just lang -> spanWith ("", [], [("lang", renderLang $ lang)]) <$> tok
+       _ -> tok
+
 inlineLanguageCommands :: PandocMonad m => M.Map Text (LP m Inlines)
 inlineLanguageCommands = M.fromList $ mk <$> M.toList polyglossiaLangToBCP47
   where
@@ -2655,3 +2664,24 @@ polyglossiaLangToBCP47 = M.fromList
   , ("urdu", \_ -> Lang "ur" "" "" [])
   , ("vietnamese", \_ -> Lang "vi" "" "" [])
   ]
+
+babelLangToBCP47 :: String -> Maybe Lang
+babelLangToBCP47 s =
+  case s of
+       "austrian" -> Just $ Lang "de" "" "AT" ["1901"]
+       "naustrian" -> Just $ Lang "de" "" "AT" []
+       "swissgerman" -> Just $ Lang "de" "" "CH" ["1901"]
+       "nswissgerman" -> Just $ Lang "de" "" "CH" []
+       "german" -> Just $ Lang "de" "" "DE" ["1901"]
+       "ngerman" -> Just $ Lang "de" "" "DE" []
+       "lowersorbian" -> Just $ Lang "dsb" "" "" []
+       "uppersorbian" -> Just $ Lang "hsb" "" "" []
+       "polutonikogreek" -> Just $ Lang "el" "" "" ["polyton"]
+       "slovene" -> Just $ Lang "sl" "" "" []
+       "australian" -> Just $ Lang "en" "" "AU" []
+       "canadian" -> Just $ Lang "en" "" "CA" []
+       "british" -> Just $ Lang "en" "" "GB" []
+       "newzealand" -> Just $ Lang "en" "" "NZ" []
+       "american" -> Just $ Lang "en" "" "US" []
+       "classiclatin" -> Just $ Lang "la" "" "" ["x-classic"]
+       _ -> fmap ($ "") $ M.lookup s polyglossiaLangToBCP47
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index 6b5d0a331..9f259d958 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -547,7 +547,7 @@ bulletListStart :: Monad m => ParserT [Char] st m Int
 bulletListStart = try $ do
   notFollowedBy' hrule  -- because hrules start out just like lists
   marker <- oneOf bulletListMarkers
-  white <- many1 spaceChar
+  white <- many1 spaceChar <|> "" <$ lookAhead (char '\n')
   return $ length (marker:white)
 
 -- parses ordered list start and returns its length (inc following whitespace)
@@ -556,7 +556,7 @@ orderedListStart :: Monad m => ListNumberStyle
                  -> RSTParser m Int
 orderedListStart style delim = try $ do
   (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim)
-  white <- many1 spaceChar
+  white <- many1 spaceChar <|> "" <$ lookAhead (char '\n')
   return $ markerLen + length white
 
 -- parse a line of a list item