aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2014-08-09 20:22:06 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2014-08-09 20:22:06 -0700
commiteed1274fcaa0348fd08624eaa863ad26248958de (patch)
tree6000aeb164abda25f1cce788e18e37e5180d7438
parentc2a0d47c7b15483efa565ab2b6d9fa836e3a8818 (diff)
parentecb42cc002f892295ea33711947338580f60522b (diff)
downloadpandoc-eed1274fcaa0348fd08624eaa863ad26248958de.tar.gz
Merge pull request #1509 from jkr/symbol
Parse "w:sym"
-rw-r--r--lib/fonts/Makefile6
-rw-r--r--lib/fonts/parseUnicodeMapping.hs40
-rw-r--r--lib/fonts/symbol.txt256
-rw-r--r--pandoc.cabal1
-rw-r--r--src/Text/Pandoc/Readers/Docx/Fonts.hs237
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs22
-rw-r--r--tests/docx.unicode.docxbin13098 -> 11472 bytes
-rw-r--r--tests/docx.unicode.native2
8 files changed, 563 insertions, 1 deletions
diff --git a/lib/fonts/Makefile b/lib/fonts/Makefile
new file mode 100644
index 000000000..5693ee054
--- /dev/null
+++ b/lib/fonts/Makefile
@@ -0,0 +1,6 @@
+symbol.hs: symbol.txt
+ runghc parseUnicodeMapping.hs symbol.txt
+
+.PHONY: clean
+clean:
+ -rm symbol.hs
diff --git a/lib/fonts/parseUnicodeMapping.hs b/lib/fonts/parseUnicodeMapping.hs
new file mode 100644
index 000000000..4f7ff692b
--- /dev/null
+++ b/lib/fonts/parseUnicodeMapping.hs
@@ -0,0 +1,40 @@
+import System.FilePath
+import Text.Parsec
+import Data.Char
+import System.Environment
+import Control.Applicative hiding (many)
+import Data.List
+
+main :: IO ()
+main = (head <$> getArgs) >>= parseUnicodeMapping
+
+
+parseUnicodeMapping :: FilePath -> IO ()
+parseUnicodeMapping fname = do
+ fin <- readFile fname
+ let mapname = dropExtension . takeFileName $ fname
+ let res = runParse fin
+ let header = "-- Generated from " ++ fname ++ "\n" ++
+ mapname ++ " :: [(Char, Char)]\n" ++ mapname ++" =\n [ "
+ let footer = "]"
+ writeFile (replaceExtension fname ".hs")
+ (header ++ (concat $ intersperse "\n , " (map show res)) ++ footer)
+
+type Unicode = Char
+
+runParse :: String -> [(Char, Unicode)]
+runParse s= either (error . show) id (parse parseMap "" s)
+
+anyline = manyTill anyChar newline
+
+getHexChar :: Parsec String () Char
+getHexChar = do
+ [(c,_)] <- readLitChar . ("\\x" ++) <$> many1 hexDigit
+ return c
+
+parseMap :: Parsec String () [(Char, Unicode)]
+parseMap = do
+ skipMany (char '#' >> anyline)
+ many (flip (,) <$> getHexChar <* tab <*> getHexChar <* anyline)
+
+
diff --git a/lib/fonts/symbol.txt b/lib/fonts/symbol.txt
new file mode 100644
index 000000000..b98baf6cf
--- /dev/null
+++ b/lib/fonts/symbol.txt
@@ -0,0 +1,256 @@
+#
+# Name: Adobe Symbol Encoding to Unicode
+# Unicode version: 2.0
+# Table version: 1.0
+# Date: 2011 July 12
+#
+# Copyright (c) 1991-2011 Unicode, Inc. All Rights reserved.
+#
+# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). No
+# claims are made as to fitness for any particular purpose. No warranties of
+# any kind are expressed or implied. The recipient agrees to determine
+# applicability of information provided. If this file has been provided on
+# magnetic media by Unicode, Inc., the sole remedy for any claim will be
+# exchange of defective media within 90 days of receipt.
+#
+# Unicode, Inc. hereby grants the right to freely use the information
+# supplied in this file in the creation of products supporting the
+# Unicode Standard, and to make copies of this file in any form for
+# internal or external distribution as long as this notice remains
+# attached.
+#
+# Format: 4 tab-delimited fields:
+#
+# (1) The Unicode value (in hexadecimal)
+# (2) The Symbol Encoding code point (in hexadecimal)
+# (3) # Unicode name
+# (4) # PostScript character name
+#
+# General Notes:
+#
+# The Unicode values in this table were produced as the result of applying
+# the algorithm described in the section "Populating a Unicode space" in the
+# document "Unicode and Glyph Names," at
+# http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
+# to the characters in Symbol. Note that some characters, such as "space",
+# are mapped to 2 Unicode values. 29 characters have assignments in the
+# Corporate Use Subarea; these are indicated by "(CUS)" in field 4. Refer to
+# the above document for more details.
+#
+# 2011 July 12: The above link is no longer valid. For comparable,
+# more current information, see the document, "Glyph", at:
+# <http://www.adobe.com/devnet/opentype/archives/glyph.html>
+#
+# Revision History:
+#
+# [v1.0, 2011 July 12]
+# Updated terms of use to current wording.
+# Updated contact information and document link.
+# No changes to the mapping data.
+#
+# [v0.2, 30 March 1999]
+# Different algorithm to produce Unicode values (see notes above) results in
+# some character codes being mapped to 2 Unicode values; use of Corporate
+# Use subarea values; addition of the euro character; changed assignments of
+# some characters such as the COPYRIGHT SIGNs and RADICAL EXTENDER. Updated
+# Unicode names to Unicode 2.0 names.
+#
+# [v0.1, 5 May 1995] First release.
+#
+# Use the Unicode reporting form <http://www.unicode.org/reporting.html>
+# for any questions or comments or to report errors in the data.
+#
+0020 20 # SPACE # space
+00A0 20 # NO-BREAK SPACE # space
+0021 21 # EXCLAMATION MARK # exclam
+2200 22 # FOR ALL # universal
+0023 23 # NUMBER SIGN # numbersign
+2203 24 # THERE EXISTS # existential
+0025 25 # PERCENT SIGN # percent
+0026 26 # AMPERSAND # ampersand
+220B 27 # CONTAINS AS MEMBER # suchthat
+0028 28 # LEFT PARENTHESIS # parenleft
+0029 29 # RIGHT PARENTHESIS # parenright
+2217 2A # ASTERISK OPERATOR # asteriskmath
+002B 2B # PLUS SIGN # plus
+002C 2C # COMMA # comma
+2212 2D # MINUS SIGN # minus
+002E 2E # FULL STOP # period
+002F 2F # SOLIDUS # slash
+0030 30 # DIGIT ZERO # zero
+0031 31 # DIGIT ONE # one
+0032 32 # DIGIT TWO # two
+0033 33 # DIGIT THREE # three
+0034 34 # DIGIT FOUR # four
+0035 35 # DIGIT FIVE # five
+0036 36 # DIGIT SIX # six
+0037 37 # DIGIT SEVEN # seven
+0038 38 # DIGIT EIGHT # eight
+0039 39 # DIGIT NINE # nine
+003A 3A # COLON # colon
+003B 3B # SEMICOLON # semicolon
+003C 3C # LESS-THAN SIGN # less
+003D 3D # EQUALS SIGN # equal
+003E 3E # GREATER-THAN SIGN # greater
+003F 3F # QUESTION MARK # question
+2245 40 # APPROXIMATELY EQUAL TO # congruent
+0391 41 # GREEK CAPITAL LETTER ALPHA # Alpha
+0392 42 # GREEK CAPITAL LETTER BETA # Beta
+03A7 43 # GREEK CAPITAL LETTER CHI # Chi
+0394 44 # GREEK CAPITAL LETTER DELTA # Delta
+2206 44 # INCREMENT # Delta
+0395 45 # GREEK CAPITAL LETTER EPSILON # Epsilon
+03A6 46 # GREEK CAPITAL LETTER PHI # Phi
+0393 47 # GREEK CAPITAL LETTER GAMMA # Gamma
+0397 48 # GREEK CAPITAL LETTER ETA # Eta
+0399 49 # GREEK CAPITAL LETTER IOTA # Iota
+03D1 4A # GREEK THETA SYMBOL # theta1
+039A 4B # GREEK CAPITAL LETTER KAPPA # Kappa
+039B 4C # GREEK CAPITAL LETTER LAMDA # Lambda
+039C 4D # GREEK CAPITAL LETTER MU # Mu
+039D 4E # GREEK CAPITAL LETTER NU # Nu
+039F 4F # GREEK CAPITAL LETTER OMICRON # Omicron
+03A0 50 # GREEK CAPITAL LETTER PI # Pi
+0398 51 # GREEK CAPITAL LETTER THETA # Theta
+03A1 52 # GREEK CAPITAL LETTER RHO # Rho
+03A3 53 # GREEK CAPITAL LETTER SIGMA # Sigma
+03A4 54 # GREEK CAPITAL LETTER TAU # Tau
+03A5 55 # GREEK CAPITAL LETTER UPSILON # Upsilon
+03C2 56 # GREEK SMALL LETTER FINAL SIGMA # sigma1
+03A9 57 # GREEK CAPITAL LETTER OMEGA # Omega
+2126 57 # OHM SIGN # Omega
+039E 58 # GREEK CAPITAL LETTER XI # Xi
+03A8 59 # GREEK CAPITAL LETTER PSI # Psi
+0396 5A # GREEK CAPITAL LETTER ZETA # Zeta
+005B 5B # LEFT SQUARE BRACKET # bracketleft
+2234 5C # THEREFORE # therefore
+005D 5D # RIGHT SQUARE BRACKET # bracketright
+22A5 5E # UP TACK # perpendicular
+005F 5F # LOW LINE # underscore
+F8E5 60 # RADICAL EXTENDER # radicalex (CUS)
+03B1 61 # GREEK SMALL LETTER ALPHA # alpha
+03B2 62 # GREEK SMALL LETTER BETA # beta
+03C7 63 # GREEK SMALL LETTER CHI # chi
+03B4 64 # GREEK SMALL LETTER DELTA # delta
+03B5 65 # GREEK SMALL LETTER EPSILON # epsilon
+03C6 66 # GREEK SMALL LETTER PHI # phi
+03B3 67 # GREEK SMALL LETTER GAMMA # gamma
+03B7 68 # GREEK SMALL LETTER ETA # eta
+03B9 69 # GREEK SMALL LETTER IOTA # iota
+03D5 6A # GREEK PHI SYMBOL # phi1
+03BA 6B # GREEK SMALL LETTER KAPPA # kappa
+03BB 6C # GREEK SMALL LETTER LAMDA # lambda
+00B5 6D # MICRO SIGN # mu
+03BC 6D # GREEK SMALL LETTER MU # mu
+03BD 6E # GREEK SMALL LETTER NU # nu
+03BF 6F # GREEK SMALL LETTER OMICRON # omicron
+03C0 70 # GREEK SMALL LETTER PI # pi
+03B8 71 # GREEK SMALL LETTER THETA # theta
+03C1 72 # GREEK SMALL LETTER RHO # rho
+03C3 73 # GREEK SMALL LETTER SIGMA # sigma
+03C4 74 # GREEK SMALL LETTER TAU # tau
+03C5 75 # GREEK SMALL LETTER UPSILON # upsilon
+03D6 76 # GREEK PI SYMBOL # omega1
+03C9 77 # GREEK SMALL LETTER OMEGA # omega
+03BE 78 # GREEK SMALL LETTER XI # xi
+03C8 79 # GREEK SMALL LETTER PSI # psi
+03B6 7A # GREEK SMALL LETTER ZETA # zeta
+007B 7B # LEFT CURLY BRACKET # braceleft
+007C 7C # VERTICAL LINE # bar
+007D 7D # RIGHT CURLY BRACKET # braceright
+223C 7E # TILDE OPERATOR # similar
+20AC A0 # EURO SIGN # Euro
+03D2 A1 # GREEK UPSILON WITH HOOK SYMBOL # Upsilon1
+2032 A2 # PRIME # minute
+2264 A3 # LESS-THAN OR EQUAL TO # lessequal
+2044 A4 # FRACTION SLASH # fraction
+2215 A4 # DIVISION SLASH # fraction
+221E A5 # INFINITY # infinity
+0192 A6 # LATIN SMALL LETTER F WITH HOOK # florin
+2663 A7 # BLACK CLUB SUIT # club
+2666 A8 # BLACK DIAMOND SUIT # diamond
+2665 A9 # BLACK HEART SUIT # heart
+2660 AA # BLACK SPADE SUIT # spade
+2194 AB # LEFT RIGHT ARROW # arrowboth
+2190 AC # LEFTWARDS ARROW # arrowleft
+2191 AD # UPWARDS ARROW # arrowup
+2192 AE # RIGHTWARDS ARROW # arrowright
+2193 AF # DOWNWARDS ARROW # arrowdown
+00B0 B0 # DEGREE SIGN # degree
+00B1 B1 # PLUS-MINUS SIGN # plusminus
+2033 B2 # DOUBLE PRIME # second
+2265 B3 # GREATER-THAN OR EQUAL TO # greaterequal
+00D7 B4 # MULTIPLICATION SIGN # multiply
+221D B5 # PROPORTIONAL TO # proportional
+2202 B6 # PARTIAL DIFFERENTIAL # partialdiff
+2022 B7 # BULLET # bullet
+00F7 B8 # DIVISION SIGN # divide
+2260 B9 # NOT EQUAL TO # notequal
+2261 BA # IDENTICAL TO # equivalence
+2248 BB # ALMOST EQUAL TO # approxequal
+2026 BC # HORIZONTAL ELLIPSIS # ellipsis
+F8E6 BD # VERTICAL ARROW EXTENDER # arrowvertex (CUS)
+F8E7 BE # HORIZONTAL ARROW EXTENDER # arrowhorizex (CUS)
+21B5 BF # DOWNWARDS ARROW WITH CORNER LEFTWARDS # carriagereturn
+2135 C0 # ALEF SYMBOL # aleph
+2111 C1 # BLACK-LETTER CAPITAL I # Ifraktur
+211C C2 # BLACK-LETTER CAPITAL R # Rfraktur
+2118 C3 # SCRIPT CAPITAL P # weierstrass
+2297 C4 # CIRCLED TIMES # circlemultiply
+2295 C5 # CIRCLED PLUS # circleplus
+2205 C6 # EMPTY SET # emptyset
+2229 C7 # INTERSECTION # intersection
+222A C8 # UNION # union
+2283 C9 # SUPERSET OF # propersuperset
+2287 CA # SUPERSET OF OR EQUAL TO # reflexsuperset
+2284 CB # NOT A SUBSET OF # notsubset
+2282 CC # SUBSET OF # propersubset
+2286 CD # SUBSET OF OR EQUAL TO # reflexsubset
+2208 CE # ELEMENT OF # element
+2209 CF # NOT AN ELEMENT OF # notelement
+2220 D0 # ANGLE # angle
+2207 D1 # NABLA # gradient
+F6DA D2 # REGISTERED SIGN SERIF # registerserif (CUS)
+F6D9 D3 # COPYRIGHT SIGN SERIF # copyrightserif (CUS)
+F6DB D4 # TRADE MARK SIGN SERIF # trademarkserif (CUS)
+220F D5 # N-ARY PRODUCT # product
+221A D6 # SQUARE ROOT # radical
+22C5 D7 # DOT OPERATOR # dotmath
+00AC D8 # NOT SIGN # logicalnot
+2227 D9 # LOGICAL AND # logicaland
+2228 DA # LOGICAL OR # logicalor
+21D4 DB # LEFT RIGHT DOUBLE ARROW # arrowdblboth
+21D0 DC # LEFTWARDS DOUBLE ARROW # arrowdblleft
+21D1 DD # UPWARDS DOUBLE ARROW # arrowdblup
+21D2 DE # RIGHTWARDS DOUBLE ARROW # arrowdblright
+21D3 DF # DOWNWARDS DOUBLE ARROW # arrowdbldown
+25CA E0 # LOZENGE # lozenge
+2329 E1 # LEFT-POINTING ANGLE BRACKET # angleleft
+F8E8 E2 # REGISTERED SIGN SANS SERIF # registersans (CUS)
+F8E9 E3 # COPYRIGHT SIGN SANS SERIF # copyrightsans (CUS)
+F8EA E4 # TRADE MARK SIGN SANS SERIF # trademarksans (CUS)
+2211 E5 # N-ARY SUMMATION # summation
+F8EB E6 # LEFT PAREN TOP # parenlefttp (CUS)
+F8EC E7 # LEFT PAREN EXTENDER # parenleftex (CUS)
+F8ED E8 # LEFT PAREN BOTTOM # parenleftbt (CUS)
+F8EE E9 # LEFT SQUARE BRACKET TOP # bracketlefttp (CUS)
+F8EF EA # LEFT SQUARE BRACKET EXTENDER # bracketleftex (CUS)
+F8F0 EB # LEFT SQUARE BRACKET BOTTOM # bracketleftbt (CUS)
+F8F1 EC # LEFT CURLY BRACKET TOP # bracelefttp (CUS)
+F8F2 ED # LEFT CURLY BRACKET MID # braceleftmid (CUS)
+F8F3 EE # LEFT CURLY BRACKET BOTTOM # braceleftbt (CUS)
+F8F4 EF # CURLY BRACKET EXTENDER # braceex (CUS)
+232A F1 # RIGHT-POINTING ANGLE BRACKET # angleright
+222B F2 # INTEGRAL # integral
+2320 F3 # TOP HALF INTEGRAL # integraltp
+F8F5 F4 # INTEGRAL EXTENDER # integralex (CUS)
+2321 F5 # BOTTOM HALF INTEGRAL # integralbt
+F8F6 F6 # RIGHT PAREN TOP # parenrighttp (CUS)
+F8F7 F7 # RIGHT PAREN EXTENDER # parenrightex (CUS)
+F8F8 F8 # RIGHT PAREN BOTTOM # parenrightbt (CUS)
+F8F9 F9 # RIGHT SQUARE BRACKET TOP # bracketrighttp (CUS)
+F8FA FA # RIGHT SQUARE BRACKET EXTENDER # bracketrightex (CUS)
+F8FB FB # RIGHT SQUARE BRACKET BOTTOM # bracketrightbt (CUS)
+F8FC FC # RIGHT CURLY BRACKET TOP # bracerighttp (CUS)
+F8FD FD # RIGHT CURLY BRACKET MID # bracerightmid (CUS)
+F8FE FE # RIGHT CURLY BRACKET BOTTOM # bracerightbt (CUS)
diff --git a/pandoc.cabal b/pandoc.cabal
index c2e742faa..d11e6f659 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -327,6 +327,7 @@ Library
Text.Pandoc.Readers.Docx.Reducible,
Text.Pandoc.Readers.Docx.Parse,
Text.Pandoc.Readers.Docx.OMath,
+ Text.Pandoc.Readers.Docx.Fonts
Text.Pandoc.Writers.Shared,
Text.Pandoc.Asciify,
Text.Pandoc.MIME,
diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs
new file mode 100644
index 000000000..cd56eb115
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs
@@ -0,0 +1,237 @@
+{-
+Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Fonts
+ Copyright : Copyright (C) 2014 Matthew Pickering
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Matthew Pickering <matthewtpickering@gmail.com>
+ Stability : alpha
+ Portability : portable
+
+Utilities to convert between font codepoints and unicode characters.
+-}
+module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where
+
+
+-- | Enumeration of recognised fonts
+data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol>
+
+-- | Given a font and codepoint, returns the corresponding unicode
+-- character
+getUnicode :: Font -> Char -> Maybe Char
+getUnicode Symbol c = lookup c symbol
+
+-- Generated from lib/fonts/symbol.txt
+symbol :: [(Char, Char)]
+symbol =
+ [ (' ',' ')
+ , (' ','\160')
+ , ('!','!')
+ , ('"','\8704')
+ , ('#','#')
+ , ('$','\8707')
+ , ('%','%')
+ , ('&','&')
+ , ('\'','\8715')
+ , ('(','(')
+ , (')',')')
+ , ('*','\8727')
+ , ('+','+')
+ , (',',',')
+ , ('-','\8722')
+ , ('.','.')
+ , ('/','/')
+ , ('0','0')
+ , ('1','1')
+ , ('2','2')
+ , ('3','3')
+ , ('4','4')
+ , ('5','5')
+ , ('6','6')
+ , ('7','7')
+ , ('8','8')
+ , ('9','9')
+ , (':',':')
+ , (';',';')
+ , ('<','<')
+ , ('=','=')
+ , ('>','>')
+ , ('?','?')
+ , ('@','\8773')
+ , ('A','\913')
+ , ('B','\914')
+ , ('C','\935')
+ , ('D','\916')
+ , ('D','\8710')
+ , ('E','\917')
+ , ('F','\934')
+ , ('G','\915')
+ , ('H','\919')
+ , ('I','\921')
+ , ('J','\977')
+ , ('K','\922')
+ , ('L','\923')
+ , ('M','\924')
+ , ('N','\925')
+ , ('O','\927')
+ , ('P','\928')
+ , ('Q','\920')
+ , ('R','\929')
+ , ('S','\931')
+ , ('T','\932')
+ , ('U','\933')
+ , ('V','\962')
+ , ('W','\937')
+ , ('W','\8486')
+ , ('X','\926')
+ , ('Y','\936')
+ , ('Z','\918')
+ , ('[','[')
+ , ('\\','\8756')
+ , (']',']')
+ , ('^','\8869')
+ , ('_','_')
+ , ('`','\63717')
+ , ('a','\945')
+ , ('b','\946')
+ , ('c','\967')
+ , ('d','\948')
+ , ('e','\949')
+ , ('f','\966')
+ , ('g','\947')
+ , ('h','\951')
+ , ('i','\953')
+ , ('j','\981')
+ , ('k','\954')
+ , ('l','\955')
+ , ('m','\181')
+ , ('m','\956')
+ , ('n','\957')
+ , ('o','\959')
+ , ('p','\960')
+ , ('q','\952')
+ , ('r','\961')
+ , ('s','\963')
+ , ('t','\964')
+ , ('u','\965')
+ , ('v','\982')
+ , ('w','\969')
+ , ('x','\958')
+ , ('y','\968')
+ , ('z','\950')
+ , ('{','{')
+ , ('|','|')
+ , ('}','}')
+ , ('~','\8764')
+ , ('\160','\8364')
+ , ('\161','\978')
+ , ('\162','\8242')
+ , ('\163','\8804')
+ , ('\164','\8260')
+ , ('\164','\8725')
+ , ('\165','\8734')
+ , ('\166','\402')
+ , ('\167','\9827')
+ , ('\168','\9830')
+ , ('\169','\9829')
+ , ('\170','\9824')
+ , ('\171','\8596')
+ , ('\172','\8592')
+ , ('\173','\8593')
+ , ('\174','\8594')
+ , ('\175','\8595')
+ , ('\176','\176')
+ , ('\177','\177')
+ , ('\178','\8243')
+ , ('\179','\8805')
+ , ('\180','\215')
+ , ('\181','\8733')
+ , ('\182','\8706')
+ , ('\183','\8226')
+ , ('\184','\247')
+ , ('\185','\8800')
+ , ('\186','\8801')
+ , ('\187','\8776')
+ , ('\188','\8230')
+ , ('\189','\63718')
+ , ('\190','\63719')
+ , ('\191','\8629')
+ , ('\192','\8501')
+ , ('\193','\8465')
+ , ('\194','\8476')
+ , ('\195','\8472')
+ , ('\196','\8855')
+ , ('\197','\8853')
+ , ('\198','\8709')
+ , ('\199','\8745')
+ , ('\200','\8746')
+ , ('\201','\8835')
+ , ('\202','\8839')
+ , ('\203','\8836')
+ , ('\204','\8834')
+ , ('\205','\8838')
+ , ('\206','\8712')
+ , ('\207','\8713')
+ , ('\208','\8736')
+ , ('\209','\8711')
+ , ('\210','\63194')
+ , ('\211','\63193')
+ , ('\212','\63195')
+ , ('\213','\8719')
+ , ('\214','\8730')
+ , ('\215','\8901')
+ , ('\216','\172')
+ , ('\217','\8743')
+ , ('\218','\8744')
+ , ('\219','\8660')
+ , ('\220','\8656')
+ , ('\221','\8657')
+ , ('\222','\8658')
+ , ('\223','\8659')
+ , ('\224','\9674')
+ , ('\225','\9001')
+ , ('\226','\63720')
+ , ('\227','\63721')
+ , ('\228','\63722')
+ , ('\229','\8721')
+ , ('\230','\63723')
+ , ('\231','\63724')
+ , ('\232','\63725')
+ , ('\233','\63726')
+ , ('\234','\63727')
+ , ('\235','\63728')
+ , ('\236','\63729')
+ , ('\237','\63730')
+ , ('\238','\63731')
+ , ('\239','\63732')
+ , ('\241','\9002')
+ , ('\242','\8747')
+ , ('\243','\8992')
+ , ('\244','\63733')
+ , ('\245','\8993')
+ , ('\246','\63734')
+ , ('\247','\63735')
+ , ('\248','\63736')
+ , ('\249','\63737')
+ , ('\250','\63738')
+ , ('\251','\63739')
+ , ('\252','\63740')
+ , ('\253','\63741')
+ , ('\254','\63742')]
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index beb58fed2..b12062407 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -59,10 +59,13 @@ import Data.Bits ((.|.))
import qualified Data.ByteString.Lazy as B
import qualified Text.Pandoc.UTF8 as UTF8
import Control.Monad.Reader
+import Control.Applicative ((<$>))
import qualified Data.Map as M
import Text.Pandoc.Compat.Except
import Text.Pandoc.Readers.Docx.OMath (readOMML)
+import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
import Text.TeXMath (Exp)
+import Data.Char (readLitChar)
data ReaderEnv = ReaderEnv { envNotes :: Notes
, envNumbering :: Numbering
@@ -673,8 +676,27 @@ elemToRunElem ns element
return $ TextRun $ strContent element
| isElem ns "w" "br" element = return LnBrk
| isElem ns "w" "tab" element = return Tab
+ | isElem ns "w" "sym" element = return (getSymChar ns element)
| otherwise = throwError WrongElem
+-- The char attribute is a hex string
+getSymChar :: NameSpaces -> Element -> RunElem
+getSymChar ns element
+ | Just s <- lowerFromPrivate <$> getCodepoint
+ , Just font <- getFont =
+ let [(char, _)] = readLitChar ("\\x" ++ s) in
+ TextRun . maybe "" (:[]) $ getUnicode font char
+ where
+ getCodepoint = findAttr (elemName ns "w" "char") element
+ getFont = stringToFont =<< findAttr (elemName ns "w" "font") element
+ lowerFromPrivate ('F':xs) = '0':xs
+ lowerFromPrivate xs = xs
+getSymChar _ _ = TextRun ""
+
+stringToFont :: String -> Maybe Font
+stringToFont "Symbol" = Just Symbol
+stringToFont _ = Nothing
+
elemToRunElems :: NameSpaces -> Element -> D [RunElem]
elemToRunElems ns element
| isElem ns "w" "r" element
diff --git a/tests/docx.unicode.docx b/tests/docx.unicode.docx
index 78d0107a1..4360f6be7 100644
--- a/tests/docx.unicode.docx
+++ b/tests/docx.unicode.docx
Binary files differ
diff --git a/tests/docx.unicode.native b/tests/docx.unicode.native
index e636355c7..f37bbb1a7 100644
--- a/tests/docx.unicode.native
+++ b/tests/docx.unicode.native
@@ -1 +1 @@
-[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10."]]
+[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10.\8744\8744"]]