diff options
-rw-r--r-- | lib/fonts/Makefile | 6 | ||||
-rw-r--r-- | lib/fonts/parseUnicodeMapping.hs | 40 | ||||
-rw-r--r-- | lib/fonts/symbol.txt | 256 | ||||
-rw-r--r-- | pandoc.cabal | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Fonts.hs | 237 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/OMath.hs | 438 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 24 | ||||
-rw-r--r-- | tests/docx.unicode.docx | bin | 13098 -> 11472 bytes | |||
-rw-r--r-- | tests/docx.unicode.native | 2 | ||||
-rw-r--r-- | tests/epub.features.native | 2 |
10 files changed, 566 insertions, 443 deletions
diff --git a/lib/fonts/Makefile b/lib/fonts/Makefile new file mode 100644 index 000000000..5693ee054 --- /dev/null +++ b/lib/fonts/Makefile @@ -0,0 +1,6 @@ +symbol.hs: symbol.txt + runghc parseUnicodeMapping.hs symbol.txt + +.PHONY: clean +clean: + -rm symbol.hs diff --git a/lib/fonts/parseUnicodeMapping.hs b/lib/fonts/parseUnicodeMapping.hs new file mode 100644 index 000000000..4f7ff692b --- /dev/null +++ b/lib/fonts/parseUnicodeMapping.hs @@ -0,0 +1,40 @@ +import System.FilePath +import Text.Parsec +import Data.Char +import System.Environment +import Control.Applicative hiding (many) +import Data.List + +main :: IO () +main = (head <$> getArgs) >>= parseUnicodeMapping + + +parseUnicodeMapping :: FilePath -> IO () +parseUnicodeMapping fname = do + fin <- readFile fname + let mapname = dropExtension . takeFileName $ fname + let res = runParse fin + let header = "-- Generated from " ++ fname ++ "\n" ++ + mapname ++ " :: [(Char, Char)]\n" ++ mapname ++" =\n [ " + let footer = "]" + writeFile (replaceExtension fname ".hs") + (header ++ (concat $ intersperse "\n , " (map show res)) ++ footer) + +type Unicode = Char + +runParse :: String -> [(Char, Unicode)] +runParse s= either (error . show) id (parse parseMap "" s) + +anyline = manyTill anyChar newline + +getHexChar :: Parsec String () Char +getHexChar = do + [(c,_)] <- readLitChar . ("\\x" ++) <$> many1 hexDigit + return c + +parseMap :: Parsec String () [(Char, Unicode)] +parseMap = do + skipMany (char '#' >> anyline) + many (flip (,) <$> getHexChar <* tab <*> getHexChar <* anyline) + + diff --git a/lib/fonts/symbol.txt b/lib/fonts/symbol.txt new file mode 100644 index 000000000..b98baf6cf --- /dev/null +++ b/lib/fonts/symbol.txt @@ -0,0 +1,256 @@ +# +# Name: Adobe Symbol Encoding to Unicode +# Unicode version: 2.0 +# Table version: 1.0 +# Date: 2011 July 12 +# +# Copyright (c) 1991-2011 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). No +# claims are made as to fitness for any particular purpose. No warranties of +# any kind are expressed or implied. The recipient agrees to determine +# applicability of information provided. If this file has been provided on +# magnetic media by Unicode, Inc., the sole remedy for any claim will be +# exchange of defective media within 90 days of receipt. +# +# Unicode, Inc. hereby grants the right to freely use the information +# supplied in this file in the creation of products supporting the +# Unicode Standard, and to make copies of this file in any form for +# internal or external distribution as long as this notice remains +# attached. +# +# Format: 4 tab-delimited fields: +# +# (1) The Unicode value (in hexadecimal) +# (2) The Symbol Encoding code point (in hexadecimal) +# (3) # Unicode name +# (4) # PostScript character name +# +# General Notes: +# +# The Unicode values in this table were produced as the result of applying +# the algorithm described in the section "Populating a Unicode space" in the +# document "Unicode and Glyph Names," at +# http://partners.adobe.com/asn/developer/typeforum/unicodegn.html +# to the characters in Symbol. Note that some characters, such as "space", +# are mapped to 2 Unicode values. 29 characters have assignments in the +# Corporate Use Subarea; these are indicated by "(CUS)" in field 4. Refer to +# the above document for more details. +# +# 2011 July 12: The above link is no longer valid. For comparable, +# more current information, see the document, "Glyph", at: +# <http://www.adobe.com/devnet/opentype/archives/glyph.html> +# +# Revision History: +# +# [v1.0, 2011 July 12] +# Updated terms of use to current wording. +# Updated contact information and document link. +# No changes to the mapping data. +# +# [v0.2, 30 March 1999] +# Different algorithm to produce Unicode values (see notes above) results in +# some character codes being mapped to 2 Unicode values; use of Corporate +# Use subarea values; addition of the euro character; changed assignments of +# some characters such as the COPYRIGHT SIGNs and RADICAL EXTENDER. Updated +# Unicode names to Unicode 2.0 names. +# +# [v0.1, 5 May 1995] First release. +# +# Use the Unicode reporting form <http://www.unicode.org/reporting.html> +# for any questions or comments or to report errors in the data. +# +0020 20 # SPACE # space +00A0 20 # NO-BREAK SPACE # space +0021 21 # EXCLAMATION MARK # exclam +2200 22 # FOR ALL # universal +0023 23 # NUMBER SIGN # numbersign +2203 24 # THERE EXISTS # existential +0025 25 # PERCENT SIGN # percent +0026 26 # AMPERSAND # ampersand +220B 27 # CONTAINS AS MEMBER # suchthat +0028 28 # LEFT PARENTHESIS # parenleft +0029 29 # RIGHT PARENTHESIS # parenright +2217 2A # ASTERISK OPERATOR # asteriskmath +002B 2B # PLUS SIGN # plus +002C 2C # COMMA # comma +2212 2D # MINUS SIGN # minus +002E 2E # FULL STOP # period +002F 2F # SOLIDUS # slash +0030 30 # DIGIT ZERO # zero +0031 31 # DIGIT ONE # one +0032 32 # DIGIT TWO # two +0033 33 # DIGIT THREE # three +0034 34 # DIGIT FOUR # four +0035 35 # DIGIT FIVE # five +0036 36 # DIGIT SIX # six +0037 37 # DIGIT SEVEN # seven +0038 38 # DIGIT EIGHT # eight +0039 39 # DIGIT NINE # nine +003A 3A # COLON # colon +003B 3B # SEMICOLON # semicolon +003C 3C # LESS-THAN SIGN # less +003D 3D # EQUALS SIGN # equal +003E 3E # GREATER-THAN SIGN # greater +003F 3F # QUESTION MARK # question +2245 40 # APPROXIMATELY EQUAL TO # congruent +0391 41 # GREEK CAPITAL LETTER ALPHA # Alpha +0392 42 # GREEK CAPITAL LETTER BETA # Beta +03A7 43 # GREEK CAPITAL LETTER CHI # Chi +0394 44 # GREEK CAPITAL LETTER DELTA # Delta +2206 44 # INCREMENT # Delta +0395 45 # GREEK CAPITAL LETTER EPSILON # Epsilon +03A6 46 # GREEK CAPITAL LETTER PHI # Phi +0393 47 # GREEK CAPITAL LETTER GAMMA # Gamma +0397 48 # GREEK CAPITAL LETTER ETA # Eta +0399 49 # GREEK CAPITAL LETTER IOTA # Iota +03D1 4A # GREEK THETA SYMBOL # theta1 +039A 4B # GREEK CAPITAL LETTER KAPPA # Kappa +039B 4C # GREEK CAPITAL LETTER LAMDA # Lambda +039C 4D # GREEK CAPITAL LETTER MU # Mu +039D 4E # GREEK CAPITAL LETTER NU # Nu +039F 4F # GREEK CAPITAL LETTER OMICRON # Omicron +03A0 50 # GREEK CAPITAL LETTER PI # Pi +0398 51 # GREEK CAPITAL LETTER THETA # Theta +03A1 52 # GREEK CAPITAL LETTER RHO # Rho +03A3 53 # GREEK CAPITAL LETTER SIGMA # Sigma +03A4 54 # GREEK CAPITAL LETTER TAU # Tau +03A5 55 # GREEK CAPITAL LETTER UPSILON # Upsilon +03C2 56 # GREEK SMALL LETTER FINAL SIGMA # sigma1 +03A9 57 # GREEK CAPITAL LETTER OMEGA # Omega +2126 57 # OHM SIGN # Omega +039E 58 # GREEK CAPITAL LETTER XI # Xi +03A8 59 # GREEK CAPITAL LETTER PSI # Psi +0396 5A # GREEK CAPITAL LETTER ZETA # Zeta +005B 5B # LEFT SQUARE BRACKET # bracketleft +2234 5C # THEREFORE # therefore +005D 5D # RIGHT SQUARE BRACKET # bracketright +22A5 5E # UP TACK # perpendicular +005F 5F # LOW LINE # underscore +F8E5 60 # RADICAL EXTENDER # radicalex (CUS) +03B1 61 # GREEK SMALL LETTER ALPHA # alpha +03B2 62 # GREEK SMALL LETTER BETA # beta +03C7 63 # GREEK SMALL LETTER CHI # chi +03B4 64 # GREEK SMALL LETTER DELTA # delta +03B5 65 # GREEK SMALL LETTER EPSILON # epsilon +03C6 66 # GREEK SMALL LETTER PHI # phi +03B3 67 # GREEK SMALL LETTER GAMMA # gamma +03B7 68 # GREEK SMALL LETTER ETA # eta +03B9 69 # GREEK SMALL LETTER IOTA # iota +03D5 6A # GREEK PHI SYMBOL # phi1 +03BA 6B # GREEK SMALL LETTER KAPPA # kappa +03BB 6C # GREEK SMALL LETTER LAMDA # lambda +00B5 6D # MICRO SIGN # mu +03BC 6D # GREEK SMALL LETTER MU # mu +03BD 6E # GREEK SMALL LETTER NU # nu +03BF 6F # GREEK SMALL LETTER OMICRON # omicron +03C0 70 # GREEK SMALL LETTER PI # pi +03B8 71 # GREEK SMALL LETTER THETA # theta +03C1 72 # GREEK SMALL LETTER RHO # rho +03C3 73 # GREEK SMALL LETTER SIGMA # sigma +03C4 74 # GREEK SMALL LETTER TAU # tau +03C5 75 # GREEK SMALL LETTER UPSILON # upsilon +03D6 76 # GREEK PI SYMBOL # omega1 +03C9 77 # GREEK SMALL LETTER OMEGA # omega +03BE 78 # GREEK SMALL LETTER XI # xi +03C8 79 # GREEK SMALL LETTER PSI # psi +03B6 7A # GREEK SMALL LETTER ZETA # zeta +007B 7B # LEFT CURLY BRACKET # braceleft +007C 7C # VERTICAL LINE # bar +007D 7D # RIGHT CURLY BRACKET # braceright +223C 7E # TILDE OPERATOR # similar +20AC A0 # EURO SIGN # Euro +03D2 A1 # GREEK UPSILON WITH HOOK SYMBOL # Upsilon1 +2032 A2 # PRIME # minute +2264 A3 # LESS-THAN OR EQUAL TO # lessequal +2044 A4 # FRACTION SLASH # fraction +2215 A4 # DIVISION SLASH # fraction +221E A5 # INFINITY # infinity +0192 A6 # LATIN SMALL LETTER F WITH HOOK # florin +2663 A7 # BLACK CLUB SUIT # club +2666 A8 # BLACK DIAMOND SUIT # diamond +2665 A9 # BLACK HEART SUIT # heart +2660 AA # BLACK SPADE SUIT # spade +2194 AB # LEFT RIGHT ARROW # arrowboth +2190 AC # LEFTWARDS ARROW # arrowleft +2191 AD # UPWARDS ARROW # arrowup +2192 AE # RIGHTWARDS ARROW # arrowright +2193 AF # DOWNWARDS ARROW # arrowdown +00B0 B0 # DEGREE SIGN # degree +00B1 B1 # PLUS-MINUS SIGN # plusminus +2033 B2 # DOUBLE PRIME # second +2265 B3 # GREATER-THAN OR EQUAL TO # greaterequal +00D7 B4 # MULTIPLICATION SIGN # multiply +221D B5 # PROPORTIONAL TO # proportional +2202 B6 # PARTIAL DIFFERENTIAL # partialdiff +2022 B7 # BULLET # bullet +00F7 B8 # DIVISION SIGN # divide +2260 B9 # NOT EQUAL TO # notequal +2261 BA # IDENTICAL TO # equivalence +2248 BB # ALMOST EQUAL TO # approxequal +2026 BC # HORIZONTAL ELLIPSIS # ellipsis +F8E6 BD # VERTICAL ARROW EXTENDER # arrowvertex (CUS) +F8E7 BE # HORIZONTAL ARROW EXTENDER # arrowhorizex (CUS) +21B5 BF # DOWNWARDS ARROW WITH CORNER LEFTWARDS # carriagereturn +2135 C0 # ALEF SYMBOL # aleph +2111 C1 # BLACK-LETTER CAPITAL I # Ifraktur +211C C2 # BLACK-LETTER CAPITAL R # Rfraktur +2118 C3 # SCRIPT CAPITAL P # weierstrass +2297 C4 # CIRCLED TIMES # circlemultiply +2295 C5 # CIRCLED PLUS # circleplus +2205 C6 # EMPTY SET # emptyset +2229 C7 # INTERSECTION # intersection +222A C8 # UNION # union +2283 C9 # SUPERSET OF # propersuperset +2287 CA # SUPERSET OF OR EQUAL TO # reflexsuperset +2284 CB # NOT A SUBSET OF # notsubset +2282 CC # SUBSET OF # propersubset +2286 CD # SUBSET OF OR EQUAL TO # reflexsubset +2208 CE # ELEMENT OF # element +2209 CF # NOT AN ELEMENT OF # notelement +2220 D0 # ANGLE # angle +2207 D1 # NABLA # gradient +F6DA D2 # REGISTERED SIGN SERIF # registerserif (CUS) +F6D9 D3 # COPYRIGHT SIGN SERIF # copyrightserif (CUS) +F6DB D4 # TRADE MARK SIGN SERIF # trademarkserif (CUS) +220F D5 # N-ARY PRODUCT # product +221A D6 # SQUARE ROOT # radical +22C5 D7 # DOT OPERATOR # dotmath +00AC D8 # NOT SIGN # logicalnot +2227 D9 # LOGICAL AND # logicaland +2228 DA # LOGICAL OR # logicalor +21D4 DB # LEFT RIGHT DOUBLE ARROW # arrowdblboth +21D0 DC # LEFTWARDS DOUBLE ARROW # arrowdblleft +21D1 DD # UPWARDS DOUBLE ARROW # arrowdblup +21D2 DE # RIGHTWARDS DOUBLE ARROW # arrowdblright +21D3 DF # DOWNWARDS DOUBLE ARROW # arrowdbldown +25CA E0 # LOZENGE # lozenge +2329 E1 # LEFT-POINTING ANGLE BRACKET # angleleft +F8E8 E2 # REGISTERED SIGN SANS SERIF # registersans (CUS) +F8E9 E3 # COPYRIGHT SIGN SANS SERIF # copyrightsans (CUS) +F8EA E4 # TRADE MARK SIGN SANS SERIF # trademarksans (CUS) +2211 E5 # N-ARY SUMMATION # summation +F8EB E6 # LEFT PAREN TOP # parenlefttp (CUS) +F8EC E7 # LEFT PAREN EXTENDER # parenleftex (CUS) +F8ED E8 # LEFT PAREN BOTTOM # parenleftbt (CUS) +F8EE E9 # LEFT SQUARE BRACKET TOP # bracketlefttp (CUS) +F8EF EA # LEFT SQUARE BRACKET EXTENDER # bracketleftex (CUS) +F8F0 EB # LEFT SQUARE BRACKET BOTTOM # bracketleftbt (CUS) +F8F1 EC # LEFT CURLY BRACKET TOP # bracelefttp (CUS) +F8F2 ED # LEFT CURLY BRACKET MID # braceleftmid (CUS) +F8F3 EE # LEFT CURLY BRACKET BOTTOM # braceleftbt (CUS) +F8F4 EF # CURLY BRACKET EXTENDER # braceex (CUS) +232A F1 # RIGHT-POINTING ANGLE BRACKET # angleright +222B F2 # INTEGRAL # integral +2320 F3 # TOP HALF INTEGRAL # integraltp +F8F5 F4 # INTEGRAL EXTENDER # integralex (CUS) +2321 F5 # BOTTOM HALF INTEGRAL # integralbt +F8F6 F6 # RIGHT PAREN TOP # parenrighttp (CUS) +F8F7 F7 # RIGHT PAREN EXTENDER # parenrightex (CUS) +F8F8 F8 # RIGHT PAREN BOTTOM # parenrightbt (CUS) +F8F9 F9 # RIGHT SQUARE BRACKET TOP # bracketrighttp (CUS) +F8FA FA # RIGHT SQUARE BRACKET EXTENDER # bracketrightex (CUS) +F8FB FB # RIGHT SQUARE BRACKET BOTTOM # bracketrightbt (CUS) +F8FC FC # RIGHT CURLY BRACKET TOP # bracerighttp (CUS) +F8FD FD # RIGHT CURLY BRACKET MID # bracerightmid (CUS) +F8FE FE # RIGHT CURLY BRACKET BOTTOM # bracerightbt (CUS) diff --git a/pandoc.cabal b/pandoc.cabal index c2e742faa..77e1c5d42 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -230,7 +230,7 @@ Library old-locale >= 1 && < 1.1, time >= 1.2 && < 1.5, HTTP >= 4000.0.5 && < 4000.3, - texmath >= 0.7 && < 0.8, + texmath >= 0.8 && < 0.9, xml >= 1.3.12 && < 1.4, random >= 1 && < 1.1, extensible-exceptions >= 0.1 && < 0.2, @@ -326,7 +326,7 @@ Library Other-Modules: Text.Pandoc.Readers.Docx.Lists, Text.Pandoc.Readers.Docx.Reducible, Text.Pandoc.Readers.Docx.Parse, - Text.Pandoc.Readers.Docx.OMath, + Text.Pandoc.Readers.Docx.Fonts Text.Pandoc.Writers.Shared, Text.Pandoc.Asciify, Text.Pandoc.MIME, diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs new file mode 100644 index 000000000..cd56eb115 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs @@ -0,0 +1,237 @@ +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fonts + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + Stability : alpha + Portability : portable + +Utilities to convert between font codepoints and unicode characters. +-} +module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where + + +-- | Enumeration of recognised fonts +data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol> + +-- | Given a font and codepoint, returns the corresponding unicode +-- character +getUnicode :: Font -> Char -> Maybe Char +getUnicode Symbol c = lookup c symbol + +-- Generated from lib/fonts/symbol.txt +symbol :: [(Char, Char)] +symbol = + [ (' ',' ') + , (' ','\160') + , ('!','!') + , ('"','\8704') + , ('#','#') + , ('$','\8707') + , ('%','%') + , ('&','&') + , ('\'','\8715') + , ('(','(') + , (')',')') + , ('*','\8727') + , ('+','+') + , (',',',') + , ('-','\8722') + , ('.','.') + , ('/','/') + , ('0','0') + , ('1','1') + , ('2','2') + , ('3','3') + , ('4','4') + , ('5','5') + , ('6','6') + , ('7','7') + , ('8','8') + , ('9','9') + , (':',':') + , (';',';') + , ('<','<') + , ('=','=') + , ('>','>') + , ('?','?') + , ('@','\8773') + , ('A','\913') + , ('B','\914') + , ('C','\935') + , ('D','\916') + , ('D','\8710') + , ('E','\917') + , ('F','\934') + , ('G','\915') + , ('H','\919') + , ('I','\921') + , ('J','\977') + , ('K','\922') + , ('L','\923') + , ('M','\924') + , ('N','\925') + , ('O','\927') + , ('P','\928') + , ('Q','\920') + , ('R','\929') + , ('S','\931') + , ('T','\932') + , ('U','\933') + , ('V','\962') + , ('W','\937') + , ('W','\8486') + , ('X','\926') + , ('Y','\936') + , ('Z','\918') + , ('[','[') + , ('\\','\8756') + , (']',']') + , ('^','\8869') + , ('_','_') + , ('`','\63717') + , ('a','\945') + , ('b','\946') + , ('c','\967') + , ('d','\948') + , ('e','\949') + , ('f','\966') + , ('g','\947') + , ('h','\951') + , ('i','\953') + , ('j','\981') + , ('k','\954') + , ('l','\955') + , ('m','\181') + , ('m','\956') + , ('n','\957') + , ('o','\959') + , ('p','\960') + , ('q','\952') + , ('r','\961') + , ('s','\963') + , ('t','\964') + , ('u','\965') + , ('v','\982') + , ('w','\969') + , ('x','\958') + , ('y','\968') + , ('z','\950') + , ('{','{') + , ('|','|') + , ('}','}') + , ('~','\8764') + , ('\160','\8364') + , ('\161','\978') + , ('\162','\8242') + , ('\163','\8804') + , ('\164','\8260') + , ('\164','\8725') + , ('\165','\8734') + , ('\166','\402') + , ('\167','\9827') + , ('\168','\9830') + , ('\169','\9829') + , ('\170','\9824') + , ('\171','\8596') + , ('\172','\8592') + , ('\173','\8593') + , ('\174','\8594') + , ('\175','\8595') + , ('\176','\176') + , ('\177','\177') + , ('\178','\8243') + , ('\179','\8805') + , ('\180','\215') + , ('\181','\8733') + , ('\182','\8706') + , ('\183','\8226') + , ('\184','\247') + , ('\185','\8800') + , ('\186','\8801') + , ('\187','\8776') + , ('\188','\8230') + , ('\189','\63718') + , ('\190','\63719') + , ('\191','\8629') + , ('\192','\8501') + , ('\193','\8465') + , ('\194','\8476') + , ('\195','\8472') + , ('\196','\8855') + , ('\197','\8853') + , ('\198','\8709') + , ('\199','\8745') + , ('\200','\8746') + , ('\201','\8835') + , ('\202','\8839') + , ('\203','\8836') + , ('\204','\8834') + , ('\205','\8838') + , ('\206','\8712') + , ('\207','\8713') + , ('\208','\8736') + , ('\209','\8711') + , ('\210','\63194') + , ('\211','\63193') + , ('\212','\63195') + , ('\213','\8719') + , ('\214','\8730') + , ('\215','\8901') + , ('\216','\172') + , ('\217','\8743') + , ('\218','\8744') + , ('\219','\8660') + , ('\220','\8656') + , ('\221','\8657') + , ('\222','\8658') + , ('\223','\8659') + , ('\224','\9674') + , ('\225','\9001') + , ('\226','\63720') + , ('\227','\63721') + , ('\228','\63722') + , ('\229','\8721') + , ('\230','\63723') + , ('\231','\63724') + , ('\232','\63725') + , ('\233','\63726') + , ('\234','\63727') + , ('\235','\63728') + , ('\236','\63729') + , ('\237','\63730') + , ('\238','\63731') + , ('\239','\63732') + , ('\241','\9002') + , ('\242','\8747') + , ('\243','\8992') + , ('\244','\63733') + , ('\245','\8993') + , ('\246','\63734') + , ('\247','\63735') + , ('\248','\63736') + , ('\249','\63737') + , ('\250','\63738') + , ('\251','\63739') + , ('\252','\63740') + , ('\253','\63741') + , ('\254','\63742')] diff --git a/src/Text/Pandoc/Readers/Docx/OMath.hs b/src/Text/Pandoc/Readers/Docx/OMath.hs deleted file mode 100644 index 47f8dd197..000000000 --- a/src/Text/Pandoc/Readers/Docx/OMath.hs +++ /dev/null @@ -1,438 +0,0 @@ -{-# LANGUAGE PatternGuards #-} - -{- -Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.Docx.Math - Copyright : Copyright (C) 2014 Jesse Rosenthal - License : GNU GPL, version 2 or above - - Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> - Stability : alpha - Portability : portable - -Types and functions for conversion of OMML into TeXMath. --} - -module Text.Pandoc.Readers.Docx.OMath (readOMML - ) where - -import Text.XML.Light -import Data.Maybe (mapMaybe, fromMaybe) -import Data.List (intersperse) -import qualified Text.TeXMath.Types as TM - -readOMML :: String -> Either String [TM.Exp] -readOMML s | Just e <- parseXMLDoc s = - case elemToOMML e of - Just exs -> Right exs - Nothing -> Left "xml file was not an <m:oMathPara> or <m:oMath> element." -readOMML _ = Left "Couldn't parse OMML file" - -elemToOMML :: Element -> Maybe [TM.Exp] -elemToOMML element | isElem "m" "oMathPara" element = do - let expList = mapMaybe elemToOMML (elChildren element) - return $ map (\l -> if length l == 1 then (head l) else TM.EGrouped l) expList -elemToOMML element | isElem "m" "oMath" element = - Just $ concat $ mapMaybe (elemToExps') (elChildren element) -elemToOMML _ = Nothing - -isElem :: String -> String -> Element -> Bool -isElem prefix name element = - let qp = fromMaybe "" (qPrefix (elName element)) - in - qName (elName element) == name && - qp == prefix - -hasElemName:: String -> String -> QName -> Bool -hasElemName prefix name qn = - let qp = fromMaybe "" (qPrefix qn) - in - qName qn == name && - qp == prefix - -data OMathRunElem = TextRun String - | LnBrk - | Tab - deriving Show - -data OMathRunTextStyle = NoStyle - | Normal - | Styled { oMathScript :: Maybe OMathTextScript - , oMathStyle :: Maybe OMathTextStyle } - deriving Show - -data OMathTextScript = ORoman - | OScript - | OFraktur - | ODoubleStruck - | OSansSerif - | OMonospace - deriving (Show, Eq) - -data OMathTextStyle = OPlain - | OBold - | OItalic - | OBoldItalic - deriving (Show, Eq) - -elemToBase :: Element -> Maybe TM.Exp -elemToBase element | isElem "m" "e" element = do - bs <- elemToBases element - return $ case bs of - (e : []) -> e - exps -> TM.EGrouped exps -elemToBase _ = Nothing - -elemToBases :: Element -> Maybe [TM.Exp] -elemToBases element | isElem "m" "e" element = - return $ concat $ mapMaybe elemToExps' (elChildren element) -elemToBases _ = Nothing - - --- TODO: The right way to do this is to use the ampersand to break the --- text lines into multiple columns. That's tricky, though, and this --- will get us most of the way for the time being. -filterAmpersand :: TM.Exp -> TM.Exp -filterAmpersand (TM.EIdentifier s) = TM.EIdentifier (filter ('&' /=) s) -filterAmpersand (TM.EText tt s) = TM.EText tt (filter ('&' /=) s) -filterAmpersand (TM.EStyled tt exps) = TM.EStyled tt (map filterAmpersand exps) -filterAmpersand (TM.EGrouped exps) = TM.EGrouped (map filterAmpersand exps) -filterAmpersand e = e - -elemToOMathRunTextStyle :: Element -> OMathRunTextStyle -elemToOMathRunTextStyle element - | Just mrPr <- filterChildName (hasElemName"m" "rPr") element - , Just _ <- filterChildName (hasElemName"m" "nor") mrPr = - Normal - | Just mrPr <- filterChildName (hasElemName"m" "rPr") element = - let scr = - case - filterChildName (hasElemName"m" "scr") mrPr >>= - findAttrBy (hasElemName"m" "val") - of - Just "roman" -> Just ORoman - Just "script" -> Just OScript - Just "fraktur" -> Just OFraktur - Just "double-struck" -> Just ODoubleStruck - Just "sans-serif" -> Just OSansSerif - Just "monospace" -> Just OMonospace - _ -> Nothing - - sty = - case - filterChildName (hasElemName"m" "sty") mrPr >>= - findAttrBy (hasElemName"m" "val") - of - Just "p" -> Just OPlain - Just "b" -> Just OBold - Just "i" -> Just OItalic - Just "bi" -> Just OBoldItalic - _ -> Nothing - in - Styled { oMathScript = scr, oMathStyle = sty } - | otherwise = NoStyle - -elemToOMathRunElem :: Element -> Maybe OMathRunElem -elemToOMathRunElem element - | isElem "w" "t" element - || isElem "m" "t" element - || isElem "w" "delText" element = Just $ TextRun $ strContent element - | isElem "w" "br" element = Just LnBrk - | isElem "w" "tab" element = Just Tab - | otherwise = Nothing - -elemToOMathRunElems :: Element -> Maybe [OMathRunElem] -elemToOMathRunElems element - | isElem "w" "r" element - || isElem "m" "r" element = - Just $ mapMaybe (elemToOMathRunElem) (elChildren element) -elemToOMathRunElems _ = Nothing - ------ And now the TeXMath Creation - -oMathRunElemToString :: OMathRunElem -> String -oMathRunElemToString (TextRun s) = s -oMathRunElemToString (LnBrk) = ['\n'] -oMathRunElemToString (Tab) = ['\t'] - -oMathRunElemsToString :: [OMathRunElem] -> String -oMathRunElemsToString = concatMap oMathRunElemToString - -oMathRunTextStyleToTextType :: OMathRunTextStyle -> Maybe TM.TextType -oMathRunTextStyleToTextType (Normal) = Just $ TM.TextNormal -oMathRunTextStyleToTextType (NoStyle) = Nothing -oMathRunTextStyleToTextType (Styled scr sty) - | Just OBold <- sty - , Just OSansSerif <- scr = - Just $ TM.TextSansSerifBold - | Just OBoldItalic <- sty - , Just OSansSerif <- scr = - Just $ TM.TextSansSerifBoldItalic - | Just OBold <- sty - , Just OScript <- scr = - Just $ TM.TextBoldScript - | Just OBold <- sty - , Just OFraktur <- scr = - Just $ TM.TextBoldFraktur - | Just OItalic <- sty - , Just OSansSerif <- scr = - Just $ TM.TextSansSerifItalic - | Just OBold <- sty = - Just $ TM.TextBold - | Just OItalic <- sty = - Just $ TM.TextItalic - | Just OMonospace <- scr = - Just $ TM.TextMonospace - | Just OSansSerif <- scr = - Just $ TM.TextSansSerif - | Just ODoubleStruck <- scr = - Just $ TM.TextDoubleStruck - | Just OScript <- scr = - Just $ TM.TextDoubleStruck - | Just OFraktur <- scr = - Just $ TM.TextFraktur - | Just OBoldItalic <- sty = - Just $ TM.TextBoldItalic - | otherwise = Nothing - - -elemToExps' :: Element -> Maybe [TM.Exp] -elemToExps' element | isElem "m" "acc" element = do - let chr = filterChildName (hasElemName "m" "accPr") element >>= - filterChildName (hasElemName "m" "chr") >>= - findAttrBy (hasElemName "m" "val") >>= - Just . head - chr' = case chr of - Just c -> c - Nothing -> '\180' -- default to acute. - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - return $ [TM.EOver False baseExp (TM.ESymbol TM.Accent [chr'])] -elemToExps' element | isElem "m" "bar" element = do - pos <- filterChildName (hasElemName "m" "barPr") element >>= - filterChildName (hasElemName "m" "pos") >>= - findAttrBy (hasElemName "m" "val") - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - case pos of - "top" -> Just [TM.EOver False baseExp (TM.ESymbol TM.Accent "\175")] - "bot" -> Just [TM.EUnder False baseExp (TM.ESymbol TM.Accent "\818")] - _ -> Nothing -elemToExps' element | isElem "m" "box" element = do - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - return [baseExp] -elemToExps' element | isElem "m" "borderBox" element = do - -- TODO: This needs to be "\\boxed" somehow. - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - return [baseExp] -elemToExps' element | isElem "m" "d" element = - let baseExps = mapMaybe - elemToBase - (elChildren element) - inDelimExps = map Right baseExps - dPr = filterChildName (hasElemName "m" "dPr") element - begChr = dPr >>= - filterChildName (hasElemName "m" "begChr") >>= - findAttrBy (hasElemName "m" "val") >>= - (\c -> if null c then (Just ' ') else (Just $ head c)) - sepChr = dPr >>= - filterChildName (hasElemName "m" "sepChr") >>= - findAttrBy (hasElemName "m" "val") >>= - (\c -> if null c then (Just ' ') else (Just $ head c)) - endChr = dPr >>= - filterChildName (hasElemName "m" "endChr") >>= - findAttrBy (hasElemName "m" "val") >>= - (\c -> if null c then (Just ' ') else (Just $ head c)) - beg = fromMaybe '(' begChr - end = fromMaybe ')' endChr - sep = fromMaybe '|' sepChr - exps = intersperse (Left [sep]) inDelimExps - in - Just [TM.EDelimited [beg] [end] exps] -elemToExps' element | isElem "m" "eqArr" element = - let expLst = mapMaybe elemToBases (elChildren element) - expLst' = map (\es -> [map filterAmpersand es]) expLst - in - return [TM.EArray [] expLst'] -elemToExps' element | isElem "m" "f" element = do - num <- filterChildName (hasElemName "m" "num") element - den <- filterChildName (hasElemName "m" "den") element - let numExp = TM.EGrouped $ concat $ mapMaybe (elemToExps') (elChildren num) - denExp = TM.EGrouped $ concat $ mapMaybe (elemToExps') (elChildren den) - return $ [TM.EFraction TM.NormalFrac numExp denExp] -elemToExps' element | isElem "m" "func" element = do - fName <- filterChildName (hasElemName "m" "fName") element - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - -- We need a string for the fname, but omml gives it to us as a - -- series of oMath elems. We're going to filter out the oMathRuns, - -- which should work for us most of the time. - let fnameString = concatMap expToString $ - concat $ mapMaybe (elemToExps') (elChildren fName) - return [TM.EMathOperator fnameString, baseExp] -elemToExps' element | isElem "m" "groupChr" element = do - let gPr = filterChildName (hasElemName "m" "groupChrPr") element - chr = gPr >>= - filterChildName (hasElemName "m" "chr") >>= - findAttrBy (hasElemName "m" "val") - pos = gPr >>= - filterChildName (hasElemName "m" "pos") >>= - findAttrBy (hasElemName "m" "val") - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - case pos of - Just "top" -> - let chr' = case chr of - Just (c:_) -> c - _ -> '\65079' -- default to overbrace - in - return [TM.EOver False baseExp (TM.ESymbol TM.Accent [chr'])] - Just "bot" -> - let chr' = case chr of - Just (c:_) -> c - _ -> '\65080' -- default to underbrace - in - return [TM.EUnder False baseExp (TM.ESymbol TM.Accent [chr'])] - _ -> Nothing -elemToExps' element | isElem "m" "limLow" element = do - baseExp <- filterChildName (hasElemName "m" "e") element - >>= elemToBase - limExp <- filterChildName (hasElemName "m" "lim") element - >>= (\e -> Just $ concat $ mapMaybe (elemToExps') (elChildren e)) - >>= (return . TM.EGrouped) - return [TM.EUnder True limExp baseExp] -elemToExps' element | isElem "m" "limUpp" element = do - baseExp <- filterChildName (hasElemName "m" "e") element - >>= elemToBase - limExp <- filterChildName (hasElemName "m" "lim") element - >>= (\e -> Just $ concat $ mapMaybe (elemToExps') (elChildren e)) - >>= (return . TM.EGrouped) - return [TM.EOver True limExp baseExp] -elemToExps' element | isElem "m" "m" element = - let rows = filterChildrenName (hasElemName "m" "mr") element - rowExps = map - (\mr -> mapMaybe - elemToBases - (elChildren mr)) - rows - in - return [TM.EArray [TM.AlignCenter] rowExps] -elemToExps' element | isElem "m" "nary" element = do - let naryPr = filterChildName (hasElemName "m" "naryPr") element - naryChr = naryPr >>= - filterChildName (hasElemName "m" "chr") >>= - findAttrBy (hasElemName "m" "val") - opChr = case naryChr of - Just (c:_) -> c - _ -> '\8747' -- default to integral - limLoc = naryPr >>= - filterChildName (hasElemName "m" "limLoc") >>= - findAttrBy (hasElemName "m" "val") - subExps <- filterChildName (hasElemName "m" "sub") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - supExps <- filterChildName (hasElemName "m" "sup") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - case limLoc of - Just "undOvr" -> return [TM.EUnderover True - (TM.ESymbol TM.Op [opChr]) - (TM.EGrouped subExps) - (TM.EGrouped supExps) - , baseExp] - _ -> return [TM.ESubsup - (TM.ESymbol TM.Op [opChr]) - (TM.EGrouped subExps) - (TM.EGrouped supExps) - , baseExp] - -elemToExps' element | isElem "m" "phant" element = do - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - return [TM.EPhantom baseExp] -elemToExps' element | isElem "m" "rad" element = do - degExps <- filterChildName (hasElemName "m" "deg") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - return $ case degExps of - [] -> [TM.ESqrt baseExp] - ds -> [TM.ERoot (TM.EGrouped ds) baseExp] -elemToExps' element | isElem "m" "sPre" element = do - subExps <- filterChildName (hasElemName "m" "sub") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - supExps <- filterChildName (hasElemName "m" "sup") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - return [TM.ESubsup - (TM.EIdentifier "") - (TM.EGrouped subExps) - (TM.EGrouped supExps) - , baseExp] -elemToExps' element | isElem "m" "sSub" element = do - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - subExps <- filterChildName (hasElemName "m" "sub") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - return [TM.ESub baseExp (TM.EGrouped subExps)] -elemToExps' element | isElem "m" "sSubSup" element = do - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - subExps <- filterChildName (hasElemName "m" "sub") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - supExps <- filterChildName (hasElemName "m" "sup") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - return [TM.ESubsup baseExp (TM.EGrouped subExps) (TM.EGrouped supExps)] -elemToExps' element | isElem "m" "sSup" element = do - baseExp <- filterChildName (hasElemName "m" "e") element >>= - elemToBase - supExps <- filterChildName (hasElemName "m" "sup") element >>= - (\e -> return $ concat $ mapMaybe (elemToExps') (elChildren e)) - return [TM.ESuper baseExp (TM.EGrouped supExps)] -elemToExps' element | isElem "m" "r" element = do - let mrPr = filterChildName (hasElemName "m" "rPr") element - lit = mrPr >>= - filterChildName (hasElemName "m" "lit") >>= - findAttrBy (hasElemName "m" "val") - txtSty = elemToOMathRunTextStyle element - mrElems <- elemToOMathRunElems element - return $ case oMathRunTextStyleToTextType txtSty of - Nothing -> [TM.EIdentifier $ oMathRunElemsToString mrElems] - Just textType -> - case lit of - Just "on" -> - [TM.EText textType (oMathRunElemsToString mrElems)] - _ -> - [TM.EStyled textType [TM.EIdentifier $ oMathRunElemsToString mrElems]] -elemToExps' _ = Nothing - - -expToString :: TM.Exp -> String -expToString (TM.ENumber s) = s -expToString (TM.EIdentifier s) = s -expToString (TM.EMathOperator s) = s -expToString (TM.ESymbol _ s) = s -expToString (TM.EText _ s) = s -expToString (TM.EGrouped exps) = concatMap expToString exps -expToString (TM.EStyled _ exps) = concatMap expToString exps -expToString _ = "" diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index beb58fed2..7d1171ee3 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -59,10 +59,13 @@ import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B import qualified Text.Pandoc.UTF8 as UTF8 import Control.Monad.Reader +import Control.Applicative ((<$>)) import qualified Data.Map as M import Text.Pandoc.Compat.Except -import Text.Pandoc.Readers.Docx.OMath (readOMML) +import Text.TeXMath.Readers.OMML (readOMML) +import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) import Text.TeXMath (Exp) +import Data.Char (readLitChar) data ReaderEnv = ReaderEnv { envNotes :: Notes , envNumbering :: Numbering @@ -673,8 +676,27 @@ elemToRunElem ns element return $ TextRun $ strContent element | isElem ns "w" "br" element = return LnBrk | isElem ns "w" "tab" element = return Tab + | isElem ns "w" "sym" element = return (getSymChar ns element) | otherwise = throwError WrongElem +-- The char attribute is a hex string +getSymChar :: NameSpaces -> Element -> RunElem +getSymChar ns element + | Just s <- lowerFromPrivate <$> getCodepoint + , Just font <- getFont = + let [(char, _)] = readLitChar ("\\x" ++ s) in + TextRun . maybe "" (:[]) $ getUnicode font char + where + getCodepoint = findAttr (elemName ns "w" "char") element + getFont = stringToFont =<< findAttr (elemName ns "w" "font") element + lowerFromPrivate ('F':xs) = '0':xs + lowerFromPrivate xs = xs +getSymChar _ _ = TextRun "" + +stringToFont :: String -> Maybe Font +stringToFont "Symbol" = Just Symbol +stringToFont _ = Nothing + elemToRunElems :: NameSpaces -> Element -> D [RunElem] elemToRunElems ns element | isElem ns "w" "r" element diff --git a/tests/docx.unicode.docx b/tests/docx.unicode.docx Binary files differindex 78d0107a1..4360f6be7 100644 --- a/tests/docx.unicode.docx +++ b/tests/docx.unicode.docx diff --git a/tests/docx.unicode.native b/tests/docx.unicode.native index e636355c7..f37bbb1a7 100644 --- a/tests/docx.unicode.native +++ b/tests/docx.unicode.native @@ -1 +1 @@ -[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10."]] +[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10.\8744\8744"]] diff --git a/tests/epub.features.native b/tests/epub.features.native index b0cdaee36..d8299d380 100644 --- a/tests/epub.features.native +++ b/tests/epub.features.native @@ -222,7 +222,7 @@ ,RawBlock (Format "html") "<section id=\"mathml-024\" class=\"ctest\">" ,Header 2 ("",[],[]) [Span ("",["nature"],[]) [Str "[REQUIRED]"],Space,Span ("",["test-id"],[]) [Str "mathml-024"],Str "Horizontal",Space,Str "stretch,",Space,Code ("",[],[]) "mover",Str ",",Space,Code ("",[],[]) "munder",Str ",",Space,Str "and",Space,Code ("",[],[]) "mspace",Space,Str "elements"] ,Para [Str "Tests",Space,Str "whether",Space,Str "horizontal",Space,Str "stretch,",Space,Code ("",[],[]) "mover",Str ",",Space,Code ("",[],[]) "munder",Str ",",Space,Code ("",[],[]) "mspace",Space,Str "elements",Space,Str "are",Space,Str "supported."] -,Para [Math DisplayMath "c = \\overset{\\text{complex\\ number}}{\\overbrace{\\underset{\\text{real}}{\\underbrace{\\qquad a\\qquad}} + \\underset{\\text{imaginary}}{\\underbrace{\\quad b{\\mathbb{i}}\\quad}}}}"] +,Para [Math DisplayMath "c = \\overset{\\text{complex\\ number}}{\\overbrace{\\underset{\\text{real}}{\\underbrace{\\mspace{20mu} a\\mspace{20mu}}} + \\underset{\\text{imaginary}}{\\underbrace{\\quad b{\\mathbb{i}}\\quad}}}}"] ,Para [Str "The",Space,Str "test",Space,Str "passes",Space,Str "if",Space,Str "the",Space,Str "rendering",Space,Str "looks",Space,Str "like",Space,Image [Str "description",Space,Str "of",Space,Str "imaginary",Space,Str "number:",Space,Str "c",Space,Str "=",Space,Str "a",Space,Str "+bi",Space,Str "with",Space,Str "an",Space,Str "overbrace",Space,Str "reading",Space,Str "'complex",Space,Str "number'",Space,Str "and",Space,Str "underbraces",Space,Str "below",Space,Str "'a'",Space,Str "and",Space,Str "'b",Space,Str "i'",Space,Str "reading",Space,Str "'real'",Space,Str "and",Space,Str "'imaginary'",Space,Str "respectively."] ("img/complex_number.png",""),Str "."] ,RawBlock (Format "html") "</section>" ,RawBlock (Format "html") "<section id=\"mathml-025\" class=\"ctest\">" |