diff options
Diffstat (limited to 'src/Text/Pandoc/Citeproc/Locator.hs')
-rw-r--r-- | src/Text/Pandoc/Citeproc/Locator.hs | 279 |
1 files changed, 279 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Citeproc/Locator.hs b/src/Text/Pandoc/Citeproc/Locator.hs new file mode 100644 index 000000000..dba762c02 --- /dev/null +++ b/src/Text/Pandoc/Citeproc/Locator.hs @@ -0,0 +1,279 @@ +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Citeproc.Locator + ( parseLocator ) +where +import Citeproc.Types +import Data.Text (Text) +import qualified Data.Text as T +import Text.Parsec +import Text.Pandoc.Definition +import Text.Pandoc.Parsing (romanNumeral) +import Text.Pandoc.Shared (stringify) +import Control.Monad (mzero) +import qualified Data.Map as M +import Data.Char (isSpace, isPunctuation, isDigit) + +parseLocator :: Locale -> [Inline] -> (Maybe (Text, Text), [Inline]) +parseLocator locale inp = + case parse (pLocatorWords (toLocatorMap locale)) "suffix" $ splitInp inp of + Right r -> r + Left _ -> (Nothing, inp) + +splitInp :: [Inline] -> [Inline] +splitInp = splitStrWhen (\c -> isSpace c || (isPunctuation c && c /= ':')) + +-- +-- Locator parsing +-- + +type LocatorParser = Parsec [Inline] () + +pLocatorWords :: LocatorMap + -> LocatorParser (Maybe (Text, Text), [Inline]) +pLocatorWords locMap = do + optional $ pMatchChar "," (== ',') + optional pSpace + (la, lo) <- pLocatorDelimited locMap <|> pLocatorIntegrated locMap + s <- getInput -- rest is suffix + -- need to trim, otherwise "p. 9" and "9" will have 'different' locators later on + -- i.e. the first one will be " 9" + return $ + if T.null la && T.null lo + then (Nothing, s) + else (Just (la, T.strip lo), s) + +pLocatorDelimited :: LocatorMap -> LocatorParser (Text, Text) +pLocatorDelimited locMap = try $ do + _ <- pMatchChar "{" (== '{') + skipMany pSpace -- gobble pre-spaces so label doesn't try to include them + (la, _) <- pLocatorLabelDelimited locMap + -- we only care about balancing {} and [] (because of the outer [] scope); + -- the rest can be anything + let inner = do { t <- anyToken; return (True, stringify t) } + gs <- many (pBalancedBraces [('{','}'), ('[',']')] inner) + _ <- pMatchChar "}" (== '}') + let lo = T.concat $ map snd gs + return (la, lo) + +pLocatorLabelDelimited :: LocatorMap -> LocatorParser (Text, Bool) +pLocatorLabelDelimited locMap + = pLocatorLabel' locMap lim <|> return ("page", True) + where + lim = stringify <$> anyToken + +pLocatorIntegrated :: LocatorMap -> LocatorParser (Text, Text) +pLocatorIntegrated locMap = try $ do + (la, wasImplicit) <- pLocatorLabelIntegrated locMap + -- if we got the label implicitly, we have presupposed the first one is + -- going to have a digit, so guarantee that. You _can_ have p. (a) + -- because you specified it. + let modifier = if wasImplicit + then requireDigits + else requireRomansOrDigits + g <- try $ pLocatorWordIntegrated (not wasImplicit) >>= modifier + gs <- many (try $ pLocatorWordIntegrated False >>= modifier) + let lo = T.concat (g:gs) + return (la, lo) + +pLocatorLabelIntegrated :: LocatorMap -> LocatorParser (Text, Bool) +pLocatorLabelIntegrated locMap + = pLocatorLabel' locMap lim <|> (lookAhead digital >> return ("page", True)) + where + lim = try $ pLocatorWordIntegrated True >>= requireRomansOrDigits + digital = try $ pLocatorWordIntegrated True >>= requireDigits + +pLocatorLabel' :: LocatorMap -> LocatorParser Text + -> LocatorParser (Text, Bool) +pLocatorLabel' locMap lim = go "" + where + -- grow the match string until we hit the end + -- trying to find the largest match for a label + go acc = try $ do + -- advance at least one token each time + -- the pathological case is "p.3" + t <- anyToken + ts <- manyTill anyToken (try $ lookAhead lim) + let s = acc <> stringify (t:ts) + case M.lookup (T.strip s) locMap of + -- try to find a longer one, or return this one + Just l -> go s <|> return (l, False) + Nothing -> go s + +-- hard requirement for a locator to have some real digits in it +requireDigits :: (Bool, Text) -> LocatorParser Text +requireDigits (_, s) = if not (T.any isDigit s) + then Prelude.fail "requireDigits" + else return s + +-- soft requirement for a sequence with some roman or arabic parts +-- (a)(iv) -- because iv is roman +-- 1(a) -- because 1 is an actual digit +-- NOT: a, (a)-(b), hello, (some text in brackets) +requireRomansOrDigits :: (Bool, Text) -> LocatorParser Text +requireRomansOrDigits (d, s) = if not d + then Prelude.fail "requireRomansOrDigits" + else return s + +pLocatorWordIntegrated :: Bool -> LocatorParser (Bool, Text) +pLocatorWordIntegrated isFirst = try $ do + punct <- if isFirst + then return "" + else (stringify <$> pLocatorSep) <|> return "" + sp <- option "" (pSpace >> return " ") + (dig, s) <- pBalancedBraces [('(',')'), ('[',']'), ('{','}')] pPageSeq + return (dig, punct <> sp <> s) + +-- we want to capture: 123, 123A, C22, XVII, 33-44, 22-33; 22-11 +-- 34(1), 34A(A), 34(1)(i)(i), (1)(a) +-- [17], [17]-[18], '591 [84]' +-- (because CSL cannot pull out individual pages/sections +-- to wrap in braces on a per-style basis) +pBalancedBraces :: [(Char, Char)] + -> LocatorParser (Bool, Text) + -> LocatorParser (Bool, Text) +pBalancedBraces braces p = try $ do + ss <- many1 surround + return $ anyWereDigitLike ss + where + except = notFollowedBy pBraces >> p + -- outer and inner + surround = foldl (\a (open, close) -> sur open close except <|> a) + except + braces + + isc c = stringify <$> pMatchChar [c] (== c) + + sur c c' m = try $ do + (d, mid) <- between (isc c) (isc c') (option (False, "") m) + return (d, T.cons c . flip T.snoc c' $ mid) + + flattened = concatMap (\(o, c) -> [o, c]) braces + pBraces = pMatchChar "braces" (`elem` flattened) + + +-- YES 1, 1.2, 1.2.3 +-- NO 1., 1.2. a.6 +-- can't use sepBy because we want to leave trailing .s +pPageSeq :: LocatorParser (Bool, Text) +pPageSeq = oneDotTwo <|> withPeriod + where + oneDotTwo = do + u <- pPageUnit + us <- many withPeriod + return $ anyWereDigitLike (u:us) + withPeriod = try $ do + -- .2 + p <- pMatchChar "." (== '.') + u <- try pPageUnit + return (fst u, stringify p <> snd u) + +anyWereDigitLike :: [(Bool, Text)] -> (Bool, Text) +anyWereDigitLike as = (any fst as, T.concat $ map snd as) + +pPageUnit :: LocatorParser (Bool, Text) +pPageUnit = roman <|> plainUnit + where + -- roman is a 'digit' + roman = (True,) <$> pRoman + plainUnit = do + ts <- many1 (notFollowedBy pSpace >> + notFollowedBy pLocatorPunct >> + anyToken) + let s = stringify ts + -- otherwise look for actual digits or -s + return (T.any isDigit s, s) + +pRoman :: LocatorParser Text +pRoman = try $ do + tok <- anyToken + case tok of + Str t -> case parse (romanNumeral True *> eof) + "roman numeral" (T.toUpper t) of + Left _ -> mzero + Right () -> return t + _ -> mzero + +pLocatorPunct :: LocatorParser Inline +pLocatorPunct = pMatchChar "punctuation" isLocatorPunct + +pLocatorSep :: LocatorParser Inline +pLocatorSep = pMatchChar "locator separator" isLocatorSep + +pMatchChar :: String -> (Char -> Bool) -> LocatorParser Inline +pMatchChar msg f = satisfyTok f' <?> msg + where + f' (Str (T.unpack -> [c])) = f c + f' _ = False + +pSpace :: LocatorParser Inline +pSpace = satisfyTok (\t -> isSpacey t || t == Str "\160") <?> "space" + +satisfyTok :: (Inline -> Bool) -> LocatorParser Inline +satisfyTok f = tokenPrim show (\sp _ _ -> sp) (\tok -> if f tok + then Just tok + else Nothing) + +isSpacey :: Inline -> Bool +isSpacey Space = True +isSpacey SoftBreak = True +isSpacey _ = False + +isLocatorPunct :: Char -> Bool +isLocatorPunct '-' = False -- page range +isLocatorPunct '–' = False -- page range, en dash +isLocatorPunct ':' = False -- vol:page-range hack +isLocatorPunct c = isPunctuation c -- includes [{()}] + +isLocatorSep :: Char -> Bool +isLocatorSep ',' = True +isLocatorSep ';' = True +isLocatorSep _ = False + +splitStrWhen :: (Char -> Bool) -> [Inline] -> [Inline] +splitStrWhen _ [] = [] +splitStrWhen p (Str xs : ys) = go (T.unpack xs) ++ splitStrWhen p ys + where + go [] = [] + go s = case break p s of + ([],[]) -> [] + (zs,[]) -> [Str $ T.pack zs] + ([],w:ws) -> Str (T.singleton w) : go ws + (zs,w:ws) -> Str (T.pack zs) : Str (T.singleton w) : go ws +splitStrWhen p (x : ys) = x : splitStrWhen p ys + +-- +-- Locator Map +-- + +type LocatorMap = M.Map Text Text + +toLocatorMap :: Locale -> LocatorMap +toLocatorMap locale = + foldr go mempty locatorTerms + where + go tname locmap = + case M.lookup tname (localeTerms locale) of + Nothing -> locmap + Just ts -> foldr (\x -> M.insert (snd x) tname) locmap ts + +locatorTerms :: [Text] +locatorTerms = + [ "book" + , "chapter" + , "column" + , "figure" + , "folio" + , "issue" + , "line" + , "note" + , "opus" + , "page" + , "number-of-pages" + , "paragraph" + , "part" + , "section" + , "sub verbo" + , "verse" + , "volume" ] |