From 2deaa7096f186c3a87a2cbf4f3ca8a042328246e Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Sat, 9 Aug 2014 19:27:49 +0100 Subject: Docx Reader: Added recognition of sym element in paragraphs --- src/Text/Pandoc/Readers/Docx/Parse.hs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index beb58fed2..5beb61f9c 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -62,7 +62,9 @@ import Control.Monad.Reader import qualified Data.Map as M import Text.Pandoc.Compat.Except import Text.Pandoc.Readers.Docx.OMath (readOMML) +import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) import Text.TeXMath (Exp) +import Data.Char (readLitChar) data ReaderEnv = ReaderEnv { envNotes :: Notes , envNumbering :: Numbering @@ -673,8 +675,25 @@ elemToRunElem ns element return $ TextRun $ strContent element | isElem ns "w" "br" element = return LnBrk | isElem ns "w" "tab" element = return Tab + | isElem ns "w" "sym" element = return (getSymChar ns element) | otherwise = throwError WrongElem +-- The char attribute is a hex string +getSymChar :: NameSpaces -> Element -> RunElem +getSymChar ns element + | Just s <- getCodepoint + , Just font <- getFont = + let [(char, _)] = readLitChar ("\\x" ++ s) in + TextRun . maybe "" (:[]) $ getUnicode font char + where + getCodepoint = findAttr (elemName ns "w" "char") element + getFont = stringToFont =<< findAttr (elemName ns "w" "font") element +getSymChar _ _ = TextRun "" + +stringToFont :: String -> Maybe Font +stringToFont "Symbol" = Just Symbol +stringToFont _ = Nothing + elemToRunElems :: NameSpaces -> Element -> D [RunElem] elemToRunElems ns element | isElem ns "w" "r" element -- cgit v1.2.3 From edc57f77fc0eedd478db0343028f597261d27a53 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Sat, 9 Aug 2014 21:26:42 +0100 Subject: Added Text.Pandoc.Readers.Docx.Fonts --- pandoc.cabal | 1 + src/Text/Pandoc/Readers/Docx/Fonts.hs | 237 ++++++++++++++++++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Docx/Fonts.hs (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/pandoc.cabal b/pandoc.cabal index c2e742faa..d11e6f659 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -327,6 +327,7 @@ Library Text.Pandoc.Readers.Docx.Reducible, Text.Pandoc.Readers.Docx.Parse, Text.Pandoc.Readers.Docx.OMath, + Text.Pandoc.Readers.Docx.Fonts Text.Pandoc.Writers.Shared, Text.Pandoc.Asciify, Text.Pandoc.MIME, diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs new file mode 100644 index 000000000..cd56eb115 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs @@ -0,0 +1,237 @@ +{- +Copyright (C) 2014 Matthew Pickering + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fonts + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering + Stability : alpha + Portability : portable + +Utilities to convert between font codepoints and unicode characters. +-} +module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where + + +-- | Enumeration of recognised fonts +data Font = Symbol -- ^ + +-- | Given a font and codepoint, returns the corresponding unicode +-- character +getUnicode :: Font -> Char -> Maybe Char +getUnicode Symbol c = lookup c symbol + +-- Generated from lib/fonts/symbol.txt +symbol :: [(Char, Char)] +symbol = + [ (' ',' ') + , (' ','\160') + , ('!','!') + , ('"','\8704') + , ('#','#') + , ('$','\8707') + , ('%','%') + , ('&','&') + , ('\'','\8715') + , ('(','(') + , (')',')') + , ('*','\8727') + , ('+','+') + , (',',',') + , ('-','\8722') + , ('.','.') + , ('/','/') + , ('0','0') + , ('1','1') + , ('2','2') + , ('3','3') + , ('4','4') + , ('5','5') + , ('6','6') + , ('7','7') + , ('8','8') + , ('9','9') + , (':',':') + , (';',';') + , ('<','<') + , ('=','=') + , ('>','>') + , ('?','?') + , ('@','\8773') + , ('A','\913') + , ('B','\914') + , ('C','\935') + , ('D','\916') + , ('D','\8710') + , ('E','\917') + , ('F','\934') + , ('G','\915') + , ('H','\919') + , ('I','\921') + , ('J','\977') + , ('K','\922') + , ('L','\923') + , ('M','\924') + , ('N','\925') + , ('O','\927') + , ('P','\928') + , ('Q','\920') + , ('R','\929') + , ('S','\931') + , ('T','\932') + , ('U','\933') + , ('V','\962') + , ('W','\937') + , ('W','\8486') + , ('X','\926') + , ('Y','\936') + , ('Z','\918') + , ('[','[') + , ('\\','\8756') + , (']',']') + , ('^','\8869') + , ('_','_') + , ('`','\63717') + , ('a','\945') + , ('b','\946') + , ('c','\967') + , ('d','\948') + , ('e','\949') + , ('f','\966') + , ('g','\947') + , ('h','\951') + , ('i','\953') + , ('j','\981') + , ('k','\954') + , ('l','\955') + , ('m','\181') + , ('m','\956') + , ('n','\957') + , ('o','\959') + , ('p','\960') + , ('q','\952') + , ('r','\961') + , ('s','\963') + , ('t','\964') + , ('u','\965') + , ('v','\982') + , ('w','\969') + , ('x','\958') + , ('y','\968') + , ('z','\950') + , ('{','{') + , ('|','|') + , ('}','}') + , ('~','\8764') + , ('\160','\8364') + , ('\161','\978') + , ('\162','\8242') + , ('\163','\8804') + , ('\164','\8260') + , ('\164','\8725') + , ('\165','\8734') + , ('\166','\402') + , ('\167','\9827') + , ('\168','\9830') + , ('\169','\9829') + , ('\170','\9824') + , ('\171','\8596') + , ('\172','\8592') + , ('\173','\8593') + , ('\174','\8594') + , ('\175','\8595') + , ('\176','\176') + , ('\177','\177') + , ('\178','\8243') + , ('\179','\8805') + , ('\180','\215') + , ('\181','\8733') + , ('\182','\8706') + , ('\183','\8226') + , ('\184','\247') + , ('\185','\8800') + , ('\186','\8801') + , ('\187','\8776') + , ('\188','\8230') + , ('\189','\63718') + , ('\190','\63719') + , ('\191','\8629') + , ('\192','\8501') + , ('\193','\8465') + , ('\194','\8476') + , ('\195','\8472') + , ('\196','\8855') + , ('\197','\8853') + , ('\198','\8709') + , ('\199','\8745') + , ('\200','\8746') + , ('\201','\8835') + , ('\202','\8839') + , ('\203','\8836') + , ('\204','\8834') + , ('\205','\8838') + , ('\206','\8712') + , ('\207','\8713') + , ('\208','\8736') + , ('\209','\8711') + , ('\210','\63194') + , ('\211','\63193') + , ('\212','\63195') + , ('\213','\8719') + , ('\214','\8730') + , ('\215','\8901') + , ('\216','\172') + , ('\217','\8743') + , ('\218','\8744') + , ('\219','\8660') + , ('\220','\8656') + , ('\221','\8657') + , ('\222','\8658') + , ('\223','\8659') + , ('\224','\9674') + , ('\225','\9001') + , ('\226','\63720') + , ('\227','\63721') + , ('\228','\63722') + , ('\229','\8721') + , ('\230','\63723') + , ('\231','\63724') + , ('\232','\63725') + , ('\233','\63726') + , ('\234','\63727') + , ('\235','\63728') + , ('\236','\63729') + , ('\237','\63730') + , ('\238','\63731') + , ('\239','\63732') + , ('\241','\9002') + , ('\242','\8747') + , ('\243','\8992') + , ('\244','\63733') + , ('\245','\8993') + , ('\246','\63734') + , ('\247','\63735') + , ('\248','\63736') + , ('\249','\63737') + , ('\250','\63738') + , ('\251','\63739') + , ('\252','\63740') + , ('\253','\63741') + , ('\254','\63742')] -- cgit v1.2.3 From 3bb19307f60c6cfda4bfdb5a3d53508f4abd786e Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Sat, 9 Aug 2014 22:06:07 +0100 Subject: Docx Parse: Recognises code points in sym elements which are in the private range --- src/Text/Pandoc/Readers/Docx/Parse.hs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 5beb61f9c..b12062407 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -59,6 +59,7 @@ import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B import qualified Text.Pandoc.UTF8 as UTF8 import Control.Monad.Reader +import Control.Applicative ((<$>)) import qualified Data.Map as M import Text.Pandoc.Compat.Except import Text.Pandoc.Readers.Docx.OMath (readOMML) @@ -681,13 +682,15 @@ elemToRunElem ns element -- The char attribute is a hex string getSymChar :: NameSpaces -> Element -> RunElem getSymChar ns element - | Just s <- getCodepoint + | Just s <- lowerFromPrivate <$> getCodepoint , Just font <- getFont = let [(char, _)] = readLitChar ("\\x" ++ s) in TextRun . maybe "" (:[]) $ getUnicode font char where getCodepoint = findAttr (elemName ns "w" "char") element getFont = stringToFont =<< findAttr (elemName ns "w" "font") element + lowerFromPrivate ('F':xs) = '0':xs + lowerFromPrivate xs = xs getSymChar _ _ = TextRun "" stringToFont :: String -> Maybe Font -- cgit v1.2.3