aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Citeproc/Locator.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Citeproc/Locator.hs')
-rw-r--r--src/Text/Pandoc/Citeproc/Locator.hs279
1 files changed, 279 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Citeproc/Locator.hs b/src/Text/Pandoc/Citeproc/Locator.hs
new file mode 100644
index 000000000..dba762c02
--- /dev/null
+++ b/src/Text/Pandoc/Citeproc/Locator.hs
@@ -0,0 +1,279 @@
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE OverloadedStrings #-}
+module Text.Pandoc.Citeproc.Locator
+ ( parseLocator )
+where
+import Citeproc.Types
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.Parsec
+import Text.Pandoc.Definition
+import Text.Pandoc.Parsing (romanNumeral)
+import Text.Pandoc.Shared (stringify)
+import Control.Monad (mzero)
+import qualified Data.Map as M
+import Data.Char (isSpace, isPunctuation, isDigit)
+
+parseLocator :: Locale -> [Inline] -> (Maybe (Text, Text), [Inline])
+parseLocator locale inp =
+ case parse (pLocatorWords (toLocatorMap locale)) "suffix" $ splitInp inp of
+ Right r -> r
+ Left _ -> (Nothing, inp)
+
+splitInp :: [Inline] -> [Inline]
+splitInp = splitStrWhen (\c -> isSpace c || (isPunctuation c && c /= ':'))
+
+--
+-- Locator parsing
+--
+
+type LocatorParser = Parsec [Inline] ()
+
+pLocatorWords :: LocatorMap
+ -> LocatorParser (Maybe (Text, Text), [Inline])
+pLocatorWords locMap = do
+ optional $ pMatchChar "," (== ',')
+ optional pSpace
+ (la, lo) <- pLocatorDelimited locMap <|> pLocatorIntegrated locMap
+ s <- getInput -- rest is suffix
+ -- need to trim, otherwise "p. 9" and "9" will have 'different' locators later on
+ -- i.e. the first one will be " 9"
+ return $
+ if T.null la && T.null lo
+ then (Nothing, s)
+ else (Just (la, T.strip lo), s)
+
+pLocatorDelimited :: LocatorMap -> LocatorParser (Text, Text)
+pLocatorDelimited locMap = try $ do
+ _ <- pMatchChar "{" (== '{')
+ skipMany pSpace -- gobble pre-spaces so label doesn't try to include them
+ (la, _) <- pLocatorLabelDelimited locMap
+ -- we only care about balancing {} and [] (because of the outer [] scope);
+ -- the rest can be anything
+ let inner = do { t <- anyToken; return (True, stringify t) }
+ gs <- many (pBalancedBraces [('{','}'), ('[',']')] inner)
+ _ <- pMatchChar "}" (== '}')
+ let lo = T.concat $ map snd gs
+ return (la, lo)
+
+pLocatorLabelDelimited :: LocatorMap -> LocatorParser (Text, Bool)
+pLocatorLabelDelimited locMap
+ = pLocatorLabel' locMap lim <|> return ("page", True)
+ where
+ lim = stringify <$> anyToken
+
+pLocatorIntegrated :: LocatorMap -> LocatorParser (Text, Text)
+pLocatorIntegrated locMap = try $ do
+ (la, wasImplicit) <- pLocatorLabelIntegrated locMap
+ -- if we got the label implicitly, we have presupposed the first one is
+ -- going to have a digit, so guarantee that. You _can_ have p. (a)
+ -- because you specified it.
+ let modifier = if wasImplicit
+ then requireDigits
+ else requireRomansOrDigits
+ g <- try $ pLocatorWordIntegrated (not wasImplicit) >>= modifier
+ gs <- many (try $ pLocatorWordIntegrated False >>= modifier)
+ let lo = T.concat (g:gs)
+ return (la, lo)
+
+pLocatorLabelIntegrated :: LocatorMap -> LocatorParser (Text, Bool)
+pLocatorLabelIntegrated locMap
+ = pLocatorLabel' locMap lim <|> (lookAhead digital >> return ("page", True))
+ where
+ lim = try $ pLocatorWordIntegrated True >>= requireRomansOrDigits
+ digital = try $ pLocatorWordIntegrated True >>= requireDigits
+
+pLocatorLabel' :: LocatorMap -> LocatorParser Text
+ -> LocatorParser (Text, Bool)
+pLocatorLabel' locMap lim = go ""
+ where
+ -- grow the match string until we hit the end
+ -- trying to find the largest match for a label
+ go acc = try $ do
+ -- advance at least one token each time
+ -- the pathological case is "p.3"
+ t <- anyToken
+ ts <- manyTill anyToken (try $ lookAhead lim)
+ let s = acc <> stringify (t:ts)
+ case M.lookup (T.strip s) locMap of
+ -- try to find a longer one, or return this one
+ Just l -> go s <|> return (l, False)
+ Nothing -> go s
+
+-- hard requirement for a locator to have some real digits in it
+requireDigits :: (Bool, Text) -> LocatorParser Text
+requireDigits (_, s) = if not (T.any isDigit s)
+ then Prelude.fail "requireDigits"
+ else return s
+
+-- soft requirement for a sequence with some roman or arabic parts
+-- (a)(iv) -- because iv is roman
+-- 1(a) -- because 1 is an actual digit
+-- NOT: a, (a)-(b), hello, (some text in brackets)
+requireRomansOrDigits :: (Bool, Text) -> LocatorParser Text
+requireRomansOrDigits (d, s) = if not d
+ then Prelude.fail "requireRomansOrDigits"
+ else return s
+
+pLocatorWordIntegrated :: Bool -> LocatorParser (Bool, Text)
+pLocatorWordIntegrated isFirst = try $ do
+ punct <- if isFirst
+ then return ""
+ else (stringify <$> pLocatorSep) <|> return ""
+ sp <- option "" (pSpace >> return " ")
+ (dig, s) <- pBalancedBraces [('(',')'), ('[',']'), ('{','}')] pPageSeq
+ return (dig, punct <> sp <> s)
+
+-- we want to capture: 123, 123A, C22, XVII, 33-44, 22-33; 22-11
+-- 34(1), 34A(A), 34(1)(i)(i), (1)(a)
+-- [17], [17]-[18], '591 [84]'
+-- (because CSL cannot pull out individual pages/sections
+-- to wrap in braces on a per-style basis)
+pBalancedBraces :: [(Char, Char)]
+ -> LocatorParser (Bool, Text)
+ -> LocatorParser (Bool, Text)
+pBalancedBraces braces p = try $ do
+ ss <- many1 surround
+ return $ anyWereDigitLike ss
+ where
+ except = notFollowedBy pBraces >> p
+ -- outer and inner
+ surround = foldl (\a (open, close) -> sur open close except <|> a)
+ except
+ braces
+
+ isc c = stringify <$> pMatchChar [c] (== c)
+
+ sur c c' m = try $ do
+ (d, mid) <- between (isc c) (isc c') (option (False, "") m)
+ return (d, T.cons c . flip T.snoc c' $ mid)
+
+ flattened = concatMap (\(o, c) -> [o, c]) braces
+ pBraces = pMatchChar "braces" (`elem` flattened)
+
+
+-- YES 1, 1.2, 1.2.3
+-- NO 1., 1.2. a.6
+-- can't use sepBy because we want to leave trailing .s
+pPageSeq :: LocatorParser (Bool, Text)
+pPageSeq = oneDotTwo <|> withPeriod
+ where
+ oneDotTwo = do
+ u <- pPageUnit
+ us <- many withPeriod
+ return $ anyWereDigitLike (u:us)
+ withPeriod = try $ do
+ -- .2
+ p <- pMatchChar "." (== '.')
+ u <- try pPageUnit
+ return (fst u, stringify p <> snd u)
+
+anyWereDigitLike :: [(Bool, Text)] -> (Bool, Text)
+anyWereDigitLike as = (any fst as, T.concat $ map snd as)
+
+pPageUnit :: LocatorParser (Bool, Text)
+pPageUnit = roman <|> plainUnit
+ where
+ -- roman is a 'digit'
+ roman = (True,) <$> pRoman
+ plainUnit = do
+ ts <- many1 (notFollowedBy pSpace >>
+ notFollowedBy pLocatorPunct >>
+ anyToken)
+ let s = stringify ts
+ -- otherwise look for actual digits or -s
+ return (T.any isDigit s, s)
+
+pRoman :: LocatorParser Text
+pRoman = try $ do
+ tok <- anyToken
+ case tok of
+ Str t -> case parse (romanNumeral True *> eof)
+ "roman numeral" (T.toUpper t) of
+ Left _ -> mzero
+ Right () -> return t
+ _ -> mzero
+
+pLocatorPunct :: LocatorParser Inline
+pLocatorPunct = pMatchChar "punctuation" isLocatorPunct
+
+pLocatorSep :: LocatorParser Inline
+pLocatorSep = pMatchChar "locator separator" isLocatorSep
+
+pMatchChar :: String -> (Char -> Bool) -> LocatorParser Inline
+pMatchChar msg f = satisfyTok f' <?> msg
+ where
+ f' (Str (T.unpack -> [c])) = f c
+ f' _ = False
+
+pSpace :: LocatorParser Inline
+pSpace = satisfyTok (\t -> isSpacey t || t == Str "\160") <?> "space"
+
+satisfyTok :: (Inline -> Bool) -> LocatorParser Inline
+satisfyTok f = tokenPrim show (\sp _ _ -> sp) (\tok -> if f tok
+ then Just tok
+ else Nothing)
+
+isSpacey :: Inline -> Bool
+isSpacey Space = True
+isSpacey SoftBreak = True
+isSpacey _ = False
+
+isLocatorPunct :: Char -> Bool
+isLocatorPunct '-' = False -- page range
+isLocatorPunct '–' = False -- page range, en dash
+isLocatorPunct ':' = False -- vol:page-range hack
+isLocatorPunct c = isPunctuation c -- includes [{()}]
+
+isLocatorSep :: Char -> Bool
+isLocatorSep ',' = True
+isLocatorSep ';' = True
+isLocatorSep _ = False
+
+splitStrWhen :: (Char -> Bool) -> [Inline] -> [Inline]
+splitStrWhen _ [] = []
+splitStrWhen p (Str xs : ys) = go (T.unpack xs) ++ splitStrWhen p ys
+ where
+ go [] = []
+ go s = case break p s of
+ ([],[]) -> []
+ (zs,[]) -> [Str $ T.pack zs]
+ ([],w:ws) -> Str (T.singleton w) : go ws
+ (zs,w:ws) -> Str (T.pack zs) : Str (T.singleton w) : go ws
+splitStrWhen p (x : ys) = x : splitStrWhen p ys
+
+--
+-- Locator Map
+--
+
+type LocatorMap = M.Map Text Text
+
+toLocatorMap :: Locale -> LocatorMap
+toLocatorMap locale =
+ foldr go mempty locatorTerms
+ where
+ go tname locmap =
+ case M.lookup tname (localeTerms locale) of
+ Nothing -> locmap
+ Just ts -> foldr (\x -> M.insert (snd x) tname) locmap ts
+
+locatorTerms :: [Text]
+locatorTerms =
+ [ "book"
+ , "chapter"
+ , "column"
+ , "figure"
+ , "folio"
+ , "issue"
+ , "line"
+ , "note"
+ , "opus"
+ , "page"
+ , "number-of-pages"
+ , "paragraph"
+ , "part"
+ , "section"
+ , "sub verbo"
+ , "verse"
+ , "volume" ]