From 5d17020a20543a5c4864ecb81b0caa5b64c280a9 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 2 Oct 2021 22:56:55 -0700 Subject: T.P.Writers.Shared: Export splitSentences as a Doc Text transform. [API change] Use this in man/ms. --- src/Text/Pandoc/Writers/Man.hs | 4 +-- src/Text/Pandoc/Writers/Ms.hs | 7 +++-- src/Text/Pandoc/Writers/Shared.hs | 66 ++++++++++++++++++++++++++++++++------- 3 files changed, 61 insertions(+), 16 deletions(-) (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs index 89789d3c5..9514a1ce7 100644 --- a/src/Text/Pandoc/Writers/Man.hs +++ b/src/Text/Pandoc/Writers/Man.hs @@ -108,10 +108,10 @@ blockToMan :: PandocMonad m -> StateT WriterState m (Doc Text) blockToMan opts (Div _ bs) = blockListToMan opts bs blockToMan opts (Plain inlines) = - inlineListToMan opts inlines + splitSentences <$> inlineListToMan opts inlines blockToMan opts (Para inlines) = do contents <- inlineListToMan opts inlines - return $ text ".PP" $$ contents + return $ text ".PP" $$ splitSentences contents blockToMan opts (LineBlock lns) = blockToMan opts $ linesToPara lns blockToMan _ b@(RawBlock f str) diff --git a/src/Text/Pandoc/Writers/Ms.hs b/src/Text/Pandoc/Writers/Ms.hs index 719407ac1..332368a67 100644 --- a/src/Text/Pandoc/Writers/Ms.hs +++ b/src/Text/Pandoc/Writers/Ms.hs @@ -142,7 +142,7 @@ blockToMs opts (Div (ident,cls,kvs) bs) = do setFirstPara return $ anchor $$ res blockToMs opts (Plain inlines) = - inlineListToMs' opts inlines + splitSentences <$> inlineListToMs' opts inlines blockToMs opts (Para [Image attr alt (src,_tit)]) | let ext = takeExtension (T.unpack src) in (ext == ".ps" || ext == ".eps") = do let (mbW,mbH) = (inPoints opts <$> dimension Width attr, @@ -155,7 +155,7 @@ blockToMs opts (Para [Image attr alt (src,_tit)]) space <> doubleQuotes (literal (tshow (floor hp :: Int))) _ -> empty - capt <- inlineListToMs' opts alt + capt <- splitSentences <$> inlineListToMs' opts alt return $ nowrap (literal ".PSPIC -C " <> doubleQuotes (literal (escapeStr opts src)) <> sizeAttrs) $$ @@ -166,7 +166,8 @@ blockToMs opts (Para inlines) = do firstPara <- gets stFirstPara resetFirstPara contents <- inlineListToMs' opts inlines - return $ literal (if firstPara then ".LP" else ".PP") $$ contents + return $ literal (if firstPara then ".LP" else ".PP") $$ + splitSentences contents blockToMs _ b@(RawBlock f str) | f == Format "ms" = return $ literal str | otherwise = do diff --git a/src/Text/Pandoc/Writers/Shared.hs b/src/Text/Pandoc/Writers/Shared.hs index 0b7c6bee0..34c4ebfba 100644 --- a/src/Text/Pandoc/Writers/Shared.hs +++ b/src/Text/Pandoc/Writers/Shared.hs @@ -36,6 +36,8 @@ module Text.Pandoc.Writers.Shared ( , toTableOfContents , endsWithPlain , toLegacyTable + , breakable + , splitSentences ) where import Safe (lastMay) @@ -49,6 +51,7 @@ import Data.List.NonEmpty (NonEmpty(..), nonEmpty) import Data.Text.Conversions (FromText(..)) import qualified Data.Map as M import qualified Data.Text as T +import Data.Text (Text) import qualified Text.Pandoc.Builder as Builder import Text.Pandoc.Definition import Text.Pandoc.Options @@ -119,13 +122,13 @@ metaValueToVal _ inlineWriter (MetaInlines is) = SimpleVal <$> inlineWriter is -- | Retrieve a field value from a template context. -getField :: FromContext a b => T.Text -> Context a -> Maybe b +getField :: FromContext a b => Text -> Context a -> Maybe b getField field (Context m) = M.lookup field m >>= fromVal -- | Set a field of a template context. If the field already has a value, -- convert it into a list with the new value appended to the old value(s). -- This is a utility function to be used in preparing template contexts. -setField :: ToContext a b => T.Text -> b -> Context a -> Context a +setField :: ToContext a b => Text -> b -> Context a -> Context a setField field val (Context m) = Context $ M.insertWith combine field (toVal val) m where @@ -135,21 +138,21 @@ setField field val (Context m) = -- | Reset a field of a template context. If the field already has a -- value, the new value replaces it. -- This is a utility function to be used in preparing template contexts. -resetField :: ToContext a b => T.Text -> b -> Context a -> Context a +resetField :: ToContext a b => Text -> b -> Context a -> Context a resetField field val (Context m) = Context (M.insert field (toVal val) m) -- | Set a field of a template context if it currently has no value. -- If it has a value, do nothing. -- This is a utility function to be used in preparing template contexts. -defField :: ToContext a b => T.Text -> b -> Context a -> Context a +defField :: ToContext a b => Text -> b -> Context a -> Context a defField field val (Context m) = Context (M.insertWith f field (toVal val) m) where f _newval oldval = oldval -- | Get the contents of the `lang` metadata field or variable. -getLang :: WriterOptions -> Meta -> Maybe T.Text +getLang :: WriterOptions -> Meta -> Maybe Text getLang opts meta = case lookupContext "lang" (writerVariables opts) of Just s -> Just s @@ -162,7 +165,7 @@ getLang opts meta = _ -> Nothing -- | Produce an HTML tag with the given pandoc attributes. -tagWithAttrs :: HasChars a => T.Text -> Attr -> Doc a +tagWithAttrs :: HasChars a => Text -> Attr -> Doc a tagWithAttrs tag (ident,classes,kvs) = hsep ["<" <> text (T.unpack tag) ,if T.null ident @@ -213,7 +216,7 @@ fixDisplayMath x = x -- | Converts a Unicode character into the ASCII sequence used to -- represent the character in "smart" Markdown. -unsmartify :: WriterOptions -> T.Text -> T.Text +unsmartify :: WriterOptions -> Text -> Text unsmartify opts = T.concatMap $ \c -> case c of '\8217' -> "'" '\8230' -> "..." @@ -345,7 +348,7 @@ gridTable opts blocksToDoc headless aligns widths headers rows = do -- | Retrieve the metadata value for a given @key@ -- and convert to Bool. -lookupMetaBool :: T.Text -> Meta -> Bool +lookupMetaBool :: Text -> Meta -> Bool lookupMetaBool key meta = case lookupMeta key meta of Just (MetaBlocks _) -> True @@ -356,7 +359,7 @@ lookupMetaBool key meta = -- | Retrieve the metadata value for a given @key@ -- and extract blocks. -lookupMetaBlocks :: T.Text -> Meta -> [Block] +lookupMetaBlocks :: Text -> Meta -> [Block] lookupMetaBlocks key meta = case lookupMeta key meta of Just (MetaBlocks bs) -> bs @@ -366,7 +369,7 @@ lookupMetaBlocks key meta = -- | Retrieve the metadata value for a given @key@ -- and extract inlines. -lookupMetaInlines :: T.Text -> Meta -> [Inline] +lookupMetaInlines :: Text -> Meta -> [Inline] lookupMetaInlines key meta = case lookupMeta key meta of Just (MetaString s) -> [Str s] @@ -377,7 +380,7 @@ lookupMetaInlines key meta = -- | Retrieve the metadata value for a given @key@ -- and convert to String. -lookupMetaString :: T.Text -> Meta -> T.Text +lookupMetaString :: Text -> Meta -> Text lookupMetaString key meta = case lookupMeta key meta of Just (MetaString s) -> s @@ -519,3 +522,44 @@ toLegacyTable (Caption _ cbody) specs thead tbodies tfoot getComponents (Cell _ _ (RowSpan h) (ColSpan w) body) = (h, w, body) + +-- | Create a breakable 'Doc' from a text. Only regular spaces +-- are break points (not tabs or nonbreaking spaces). +breakable :: Text -> Doc Text +breakable t + | T.any (== ' ') t = mconcat $ foldr go mempty (T.split (==' ') t) + | otherwise = Text (realLength t) t + where + go "" xs = + case xs of + BreakingSpace : _ -> xs + _ -> BreakingSpace : xs + go t' xs = Text (realLength t') t' : + case xs of + [] -> xs + BreakingSpace : _ -> xs + _ -> BreakingSpace : xs + +splitSentences :: Doc Text -> Doc Text +splitSentences = go . toList + where + go [] = mempty + go (Text len t : BreakingSpace : xs) = + if isSentenceEnding t + then Text len t <> NewLine <> go xs + else Text len t <> BreakingSpace <> go xs + go (x:xs) = x <> go xs + + toList (Concat (Concat a b) c) = toList (Concat a (Concat b c)) + toList (Concat a b) = a : toList b + toList x = [x] + + isSentenceEnding t = + case T.unsnoc t of + Just (t',c) + | c == '.' || c == '!' || c == '?' -> True + | c == ')' || c == ']' || c == '"' || c == '\x201D' -> + case T.unsnoc t' of + Just (_,d) -> d == '.' || d == '!' || d == '?' + _ -> False + _ -> False -- cgit v1.2.3