Merge pull request #1509 from jkr/symbol

Parse "w:sym"
author: John MacFarlane <jgm@berkeley.edu> 2014-08-09 20:22:06 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2014-08-09 20:22:06 -0700
commit: eed1274fcaa0348fd08624eaa863ad26248958de (patch)
tree: 6000aeb164abda25f1cce788e18e37e5180d7438
parent: c2a0d47c7b15483efa565ab2b6d9fa836e3a8818 (diff)
parent: ecb42cc002f892295ea33711947338580f60522b (diff)
download: pandoc-eed1274fcaa0348fd08624eaa863ad26248958de.tar.gz
8 files changed, 563 insertions, 1 deletions
diff --git a/lib/fonts/Makefile b/lib/fonts/Makefile
new file mode 100644
index 000000000..5693ee054
--- /dev/null
+++ b/lib/fonts/Makefile
@@ -0,0 +1,6 @@
+symbol.hs: symbol.txt
+	runghc parseUnicodeMapping.hs symbol.txt
+
+.PHONY: clean
+clean:
+	-rm symbol.hs
diff --git a/lib/fonts/parseUnicodeMapping.hs b/lib/fonts/parseUnicodeMapping.hs
new file mode 100644
index 000000000..4f7ff692b
--- /dev/null
+++ b/lib/fonts/parseUnicodeMapping.hs
@@ -0,0 +1,40 @@
+import System.FilePath
+import Text.Parsec
+import Data.Char
+import System.Environment
+import Control.Applicative hiding (many)
+import Data.List
+
+main :: IO ()
+main = (head <$> getArgs) >>= parseUnicodeMapping
+
+
+parseUnicodeMapping :: FilePath -> IO ()
+parseUnicodeMapping fname = do
+  fin <- readFile fname
+  let mapname = dropExtension . takeFileName $ fname
+  let res = runParse fin
+  let header = "-- Generated from " ++ fname ++ "\n" ++
+                mapname ++ " :: [(Char, Char)]\n" ++ mapname ++" =\n  [ "
+  let footer = "]"
+  writeFile (replaceExtension fname ".hs")
+    (header ++ (concat $ intersperse "\n  , " (map show res)) ++ footer)
+
+type Unicode = Char
+
+runParse :: String -> [(Char, Unicode)]
+runParse s=  either (error . show) id (parse parseMap "" s)
+
+anyline = manyTill anyChar newline
+
+getHexChar :: Parsec String () Char
+getHexChar = do
+  [(c,_)] <- readLitChar . ("\\x" ++) <$> many1 hexDigit
+  return c
+
+parseMap :: Parsec String () [(Char, Unicode)]
+parseMap = do
+  skipMany (char '#' >> anyline)
+  many (flip (,) <$> getHexChar <* tab <*> getHexChar <* anyline)
+
+
diff --git a/lib/fonts/symbol.txt b/lib/fonts/symbol.txt
new file mode 100644
index 000000000..b98baf6cf
--- /dev/null
+++ b/lib/fonts/symbol.txt
@@ -0,0 +1,256 @@
+#
+#  Name:             Adobe Symbol Encoding to Unicode
+#  Unicode version:  2.0
+#  Table version:    1.0
+#  Date:             2011 July 12
+#  
+#  Copyright (c) 1991-2011 Unicode, Inc. All Rights reserved.
+#  
+#  This file is provided as-is by Unicode, Inc. (The Unicode Consortium). No
+#  claims are made as to fitness for any particular purpose. No warranties of
+#  any kind are expressed or implied. The recipient agrees to determine
+#  applicability of information provided. If this file has been provided on
+#  magnetic media by Unicode, Inc., the sole remedy for any claim will be
+#  exchange of defective media within 90 days of receipt.
+#  
+#  Unicode, Inc. hereby grants the right to freely use the information
+#  supplied in this file in the creation of products supporting the
+#  Unicode Standard, and to make copies of this file in any form for
+#  internal or external distribution as long as this notice remains
+#  attached.
+#  
+#  Format: 4 tab-delimited fields:
+#
+#    (1) The Unicode value (in hexadecimal)
+#    (2) The Symbol Encoding code point (in hexadecimal)
+#    (3) # Unicode name
+#    (4) # PostScript character name
+#  
+#  General Notes:
+# 
+#    The Unicode values in this table were produced as the result of applying
+#    the algorithm described in the section "Populating a Unicode space" in the
+#    document "Unicode and Glyph Names," at
+#    http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
+#    to the characters in Symbol. Note that some characters, such as "space",
+#    are mapped to 2 Unicode values. 29 characters have assignments in the
+#    Corporate Use Subarea; these are indicated by "(CUS)" in field 4. Refer to
+#    the above document for more details.
+#
+#    2011 July 12: The above link is no longer valid. For comparable,
+#    more current information, see the document, "Glyph", at:
+#    <http://www.adobe.com/devnet/opentype/archives/glyph.html>
+#
+#  Revision History:
+#
+#    [v1.0, 2011 July 12]
+#    Updated terms of use to current wording.
+#    Updated contact information and document link.
+#    No changes to the mapping data.
+#
+#    [v0.2, 30 March 1999]
+#    Different algorithm to produce Unicode values (see notes above) results in
+#    some character codes being mapped to 2 Unicode values; use of Corporate
+#    Use subarea values; addition of the euro character; changed assignments of
+#    some characters such as the COPYRIGHT SIGNs and RADICAL EXTENDER. Updated
+#    Unicode names to Unicode 2.0 names.
+#
+#    [v0.1, 5 May 1995] First release.
+#
+#  Use the Unicode reporting form <http://www.unicode.org/reporting.html>
+#    for any questions or comments or to report errors in the data.
+#
+0020	20	# SPACE	# space
+00A0	20	# NO-BREAK SPACE	# space
+0021	21	# EXCLAMATION MARK	# exclam
+2200	22	# FOR ALL	# universal
+0023	23	# NUMBER SIGN	# numbersign
+2203	24	# THERE EXISTS	# existential
+0025	25	# PERCENT SIGN	# percent
+0026	26	# AMPERSAND	# ampersand
+220B	27	# CONTAINS AS MEMBER	# suchthat
+0028	28	# LEFT PARENTHESIS	# parenleft
+0029	29	# RIGHT PARENTHESIS	# parenright
+2217	2A	# ASTERISK OPERATOR	# asteriskmath
+002B	2B	# PLUS SIGN	# plus
+002C	2C	# COMMA	# comma
+2212	2D	# MINUS SIGN	# minus
+002E	2E	# FULL STOP	# period
+002F	2F	# SOLIDUS	# slash
+0030	30	# DIGIT ZERO	# zero
+0031	31	# DIGIT ONE	# one
+0032	32	# DIGIT TWO	# two
+0033	33	# DIGIT THREE	# three
+0034	34	# DIGIT FOUR	# four
+0035	35	# DIGIT FIVE	# five
+0036	36	# DIGIT SIX	# six
+0037	37	# DIGIT SEVEN	# seven
+0038	38	# DIGIT EIGHT	# eight
+0039	39	# DIGIT NINE	# nine
+003A	3A	# COLON	# colon
+003B	3B	# SEMICOLON	# semicolon
+003C	3C	# LESS-THAN SIGN	# less
+003D	3D	# EQUALS SIGN	# equal
+003E	3E	# GREATER-THAN SIGN	# greater
+003F	3F	# QUESTION MARK	# question
+2245	40	# APPROXIMATELY EQUAL TO	# congruent
+0391	41	# GREEK CAPITAL LETTER ALPHA	# Alpha
+0392	42	# GREEK CAPITAL LETTER BETA	# Beta
+03A7	43	# GREEK CAPITAL LETTER CHI	# Chi
+0394	44	# GREEK CAPITAL LETTER DELTA	# Delta
+2206	44	# INCREMENT	# Delta
+0395	45	# GREEK CAPITAL LETTER EPSILON	# Epsilon
+03A6	46	# GREEK CAPITAL LETTER PHI	# Phi
+0393	47	# GREEK CAPITAL LETTER GAMMA	# Gamma
+0397	48	# GREEK CAPITAL LETTER ETA	# Eta
+0399	49	# GREEK CAPITAL LETTER IOTA	# Iota
+03D1	4A	# GREEK THETA SYMBOL	# theta1
+039A	4B	# GREEK CAPITAL LETTER KAPPA	# Kappa
+039B	4C	# GREEK CAPITAL LETTER LAMDA	# Lambda
+039C	4D	# GREEK CAPITAL LETTER MU	# Mu
+039D	4E	# GREEK CAPITAL LETTER NU	# Nu
+039F	4F	# GREEK CAPITAL LETTER OMICRON	# Omicron
+03A0	50	# GREEK CAPITAL LETTER PI	# Pi
+0398	51	# GREEK CAPITAL LETTER THETA	# Theta
+03A1	52	# GREEK CAPITAL LETTER RHO	# Rho
+03A3	53	# GREEK CAPITAL LETTER SIGMA	# Sigma
+03A4	54	# GREEK CAPITAL LETTER TAU	# Tau
+03A5	55	# GREEK CAPITAL LETTER UPSILON	# Upsilon
+03C2	56	# GREEK SMALL LETTER FINAL SIGMA	# sigma1
+03A9	57	# GREEK CAPITAL LETTER OMEGA	# Omega
+2126	57	# OHM SIGN	# Omega
+039E	58	# GREEK CAPITAL LETTER XI	# Xi
+03A8	59	# GREEK CAPITAL LETTER PSI	# Psi
+0396	5A	# GREEK CAPITAL LETTER ZETA	# Zeta
+005B	5B	# LEFT SQUARE BRACKET	# bracketleft
+2234	5C	# THEREFORE	# therefore
+005D	5D	# RIGHT SQUARE BRACKET	# bracketright
+22A5	5E	# UP TACK	# perpendicular
+005F	5F	# LOW LINE	# underscore
+F8E5	60	# RADICAL EXTENDER	# radicalex (CUS)
+03B1	61	# GREEK SMALL LETTER ALPHA	# alpha
+03B2	62	# GREEK SMALL LETTER BETA	# beta
+03C7	63	# GREEK SMALL LETTER CHI	# chi
+03B4	64	# GREEK SMALL LETTER DELTA	# delta
+03B5	65	# GREEK SMALL LETTER EPSILON	# epsilon
+03C6	66	# GREEK SMALL LETTER PHI	# phi
+03B3	67	# GREEK SMALL LETTER GAMMA	# gamma
+03B7	68	# GREEK SMALL LETTER ETA	# eta
+03B9	69	# GREEK SMALL LETTER IOTA	# iota
+03D5	6A	# GREEK PHI SYMBOL	# phi1
+03BA	6B	# GREEK SMALL LETTER KAPPA	# kappa
+03BB	6C	# GREEK SMALL LETTER LAMDA	# lambda
+00B5	6D	# MICRO SIGN	# mu
+03BC	6D	# GREEK SMALL LETTER MU	# mu
+03BD	6E	# GREEK SMALL LETTER NU	# nu
+03BF	6F	# GREEK SMALL LETTER OMICRON	# omicron
+03C0	70	# GREEK SMALL LETTER PI	# pi
+03B8	71	# GREEK SMALL LETTER THETA	# theta
+03C1	72	# GREEK SMALL LETTER RHO	# rho
+03C3	73	# GREEK SMALL LETTER SIGMA	# sigma
+03C4	74	# GREEK SMALL LETTER TAU	# tau
+03C5	75	# GREEK SMALL LETTER UPSILON	# upsilon
+03D6	76	# GREEK PI SYMBOL	# omega1
+03C9	77	# GREEK SMALL LETTER OMEGA	# omega
+03BE	78	# GREEK SMALL LETTER XI	# xi
+03C8	79	# GREEK SMALL LETTER PSI	# psi
+03B6	7A	# GREEK SMALL LETTER ZETA	# zeta
+007B	7B	# LEFT CURLY BRACKET	# braceleft
+007C	7C	# VERTICAL LINE	# bar
+007D	7D	# RIGHT CURLY BRACKET	# braceright
+223C	7E	# TILDE OPERATOR	# similar
+20AC	A0	# EURO SIGN	# Euro
+03D2	A1	# GREEK UPSILON WITH HOOK SYMBOL	# Upsilon1
+2032	A2	# PRIME	# minute
+2264	A3	# LESS-THAN OR EQUAL TO	# lessequal
+2044	A4	# FRACTION SLASH	# fraction
+2215	A4	# DIVISION SLASH	# fraction
+221E	A5	# INFINITY	# infinity
+0192	A6	# LATIN SMALL LETTER F WITH HOOK	# florin
+2663	A7	# BLACK CLUB SUIT	# club
+2666	A8	# BLACK DIAMOND SUIT	# diamond
+2665	A9	# BLACK HEART SUIT	# heart
+2660	AA	# BLACK SPADE SUIT	# spade
+2194	AB	# LEFT RIGHT ARROW	# arrowboth
+2190	AC	# LEFTWARDS ARROW	# arrowleft
+2191	AD	# UPWARDS ARROW	# arrowup
+2192	AE	# RIGHTWARDS ARROW	# arrowright
+2193	AF	# DOWNWARDS ARROW	# arrowdown
+00B0	B0	# DEGREE SIGN	# degree
+00B1	B1	# PLUS-MINUS SIGN	# plusminus
+2033	B2	# DOUBLE PRIME	# second
+2265	B3	# GREATER-THAN OR EQUAL TO	# greaterequal
+00D7	B4	# MULTIPLICATION SIGN	# multiply
+221D	B5	# PROPORTIONAL TO	# proportional
+2202	B6	# PARTIAL DIFFERENTIAL	# partialdiff
+2022	B7	# BULLET	# bullet
+00F7	B8	# DIVISION SIGN	# divide
+2260	B9	# NOT EQUAL TO	# notequal
+2261	BA	# IDENTICAL TO	# equivalence
+2248	BB	# ALMOST EQUAL TO	# approxequal
+2026	BC	# HORIZONTAL ELLIPSIS	# ellipsis
+F8E6	BD	# VERTICAL ARROW EXTENDER	# arrowvertex (CUS)
+F8E7	BE	# HORIZONTAL ARROW EXTENDER	# arrowhorizex (CUS)
+21B5	BF	# DOWNWARDS ARROW WITH CORNER LEFTWARDS	# carriagereturn
+2135	C0	# ALEF SYMBOL	# aleph
+2111	C1	# BLACK-LETTER CAPITAL I	# Ifraktur
+211C	C2	# BLACK-LETTER CAPITAL R	# Rfraktur
+2118	C3	# SCRIPT CAPITAL P	# weierstrass
+2297	C4	# CIRCLED TIMES	# circlemultiply
+2295	C5	# CIRCLED PLUS	# circleplus
+2205	C6	# EMPTY SET	# emptyset
+2229	C7	# INTERSECTION	# intersection
+222A	C8	# UNION	# union
+2283	C9	# SUPERSET OF	# propersuperset
+2287	CA	# SUPERSET OF OR EQUAL TO	# reflexsuperset
+2284	CB	# NOT A SUBSET OF	# notsubset
+2282	CC	# SUBSET OF	# propersubset
+2286	CD	# SUBSET OF OR EQUAL TO	# reflexsubset
+2208	CE	# ELEMENT OF	# element
+2209	CF	# NOT AN ELEMENT OF	# notelement
+2220	D0	# ANGLE	# angle
+2207	D1	# NABLA	# gradient
+F6DA	D2	# REGISTERED SIGN SERIF	# registerserif (CUS)
+F6D9	D3	# COPYRIGHT SIGN SERIF	# copyrightserif (CUS)
+F6DB	D4	# TRADE MARK SIGN SERIF	# trademarkserif (CUS)
+220F	D5	# N-ARY PRODUCT	# product
+221A	D6	# SQUARE ROOT	# radical
+22C5	D7	# DOT OPERATOR	# dotmath
+00AC	D8	# NOT SIGN	# logicalnot
+2227	D9	# LOGICAL AND	# logicaland
+2228	DA	# LOGICAL OR	# logicalor
+21D4	DB	# LEFT RIGHT DOUBLE ARROW	# arrowdblboth
+21D0	DC	# LEFTWARDS DOUBLE ARROW	# arrowdblleft
+21D1	DD	# UPWARDS DOUBLE ARROW	# arrowdblup
+21D2	DE	# RIGHTWARDS DOUBLE ARROW	# arrowdblright
+21D3	DF	# DOWNWARDS DOUBLE ARROW	# arrowdbldown
+25CA	E0	# LOZENGE	# lozenge
+2329	E1	# LEFT-POINTING ANGLE BRACKET	# angleleft
+F8E8	E2	# REGISTERED SIGN SANS SERIF	# registersans (CUS)
+F8E9	E3	# COPYRIGHT SIGN SANS SERIF	# copyrightsans (CUS)
+F8EA	E4	# TRADE MARK SIGN SANS SERIF	# trademarksans (CUS)
+2211	E5	# N-ARY SUMMATION	# summation
+F8EB	E6	# LEFT PAREN TOP	# parenlefttp (CUS)
+F8EC	E7	# LEFT PAREN EXTENDER	# parenleftex (CUS)
+F8ED	E8	# LEFT PAREN BOTTOM	# parenleftbt (CUS)
+F8EE	E9	# LEFT SQUARE BRACKET TOP	# bracketlefttp (CUS)
+F8EF	EA	# LEFT SQUARE BRACKET EXTENDER	# bracketleftex (CUS)
+F8F0	EB	# LEFT SQUARE BRACKET BOTTOM	# bracketleftbt (CUS)
+F8F1	EC	# LEFT CURLY BRACKET TOP	# bracelefttp (CUS)
+F8F2	ED	# LEFT CURLY BRACKET MID	# braceleftmid (CUS)
+F8F3	EE	# LEFT CURLY BRACKET BOTTOM	# braceleftbt (CUS)
+F8F4	EF	# CURLY BRACKET EXTENDER	# braceex (CUS)
+232A	F1	# RIGHT-POINTING ANGLE BRACKET	# angleright
+222B	F2	# INTEGRAL	# integral
+2320	F3	# TOP HALF INTEGRAL	# integraltp
+F8F5	F4	# INTEGRAL EXTENDER	# integralex (CUS)
+2321	F5	# BOTTOM HALF INTEGRAL	# integralbt
+F8F6	F6	# RIGHT PAREN TOP	# parenrighttp (CUS)
+F8F7	F7	# RIGHT PAREN EXTENDER	# parenrightex (CUS)
+F8F8	F8	# RIGHT PAREN BOTTOM	# parenrightbt (CUS)
+F8F9	F9	# RIGHT SQUARE BRACKET TOP	# bracketrighttp (CUS)
+F8FA	FA	# RIGHT SQUARE BRACKET EXTENDER	# bracketrightex (CUS)
+F8FB	FB	# RIGHT SQUARE BRACKET BOTTOM	# bracketrightbt (CUS)
+F8FC	FC	# RIGHT CURLY BRACKET TOP	# bracerighttp (CUS)
+F8FD	FD	# RIGHT CURLY BRACKET MID	# bracerightmid (CUS)
+F8FE	FE	# RIGHT CURLY BRACKET BOTTOM	# bracerightbt (CUS)
diff --git a/pandoc.cabal b/pandoc.cabal
index c2e742faa..d11e6f659 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -327,6 +327,7 @@ Library
                    Text.Pandoc.Readers.Docx.Reducible,
                    Text.Pandoc.Readers.Docx.Parse,
                    Text.Pandoc.Readers.Docx.OMath,
+                   Text.Pandoc.Readers.Docx.Fonts
                    Text.Pandoc.Writers.Shared,
                    Text.Pandoc.Asciify,
                    Text.Pandoc.MIME,
diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs
new file mode 100644
index 000000000..cd56eb115
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs
@@ -0,0 +1,237 @@
+{-
+Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.Docx.Fonts
+   Copyright   : Copyright (C) 2014 Matthew Pickering
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Matthew Pickering <matthewtpickering@gmail.com>
+   Stability   : alpha
+   Portability : portable
+
+Utilities to convert between font codepoints and unicode characters.
+-}
+module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where
+
+
+-- | Enumeration of recognised fonts
+data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol>
+
+-- | Given a font and codepoint, returns the corresponding unicode
+-- character
+getUnicode :: Font -> Char -> Maybe Char
+getUnicode Symbol c = lookup c symbol
+
+-- Generated from lib/fonts/symbol.txt
+symbol :: [(Char, Char)]
+symbol =
+  [ (' ',' ')
+  , (' ','\160')
+  , ('!','!')
+  , ('"','\8704')
+  , ('#','#')
+  , ('$','\8707')
+  , ('%','%')
+  , ('&','&')
+  , ('\'','\8715')
+  , ('(','(')
+  , (')',')')
+  , ('*','\8727')
+  , ('+','+')
+  , (',',',')
+  , ('-','\8722')
+  , ('.','.')
+  , ('/','/')
+  , ('0','0')
+  , ('1','1')
+  , ('2','2')
+  , ('3','3')
+  , ('4','4')
+  , ('5','5')
+  , ('6','6')
+  , ('7','7')
+  , ('8','8')
+  , ('9','9')
+  , (':',':')
+  , (';',';')
+  , ('<','<')
+  , ('=','=')
+  , ('>','>')
+  , ('?','?')
+  , ('@','\8773')
+  , ('A','\913')
+  , ('B','\914')
+  , ('C','\935')
+  , ('D','\916')
+  , ('D','\8710')
+  , ('E','\917')
+  , ('F','\934')
+  , ('G','\915')
+  , ('H','\919')
+  , ('I','\921')
+  , ('J','\977')
+  , ('K','\922')
+  , ('L','\923')
+  , ('M','\924')
+  , ('N','\925')
+  , ('O','\927')
+  , ('P','\928')
+  , ('Q','\920')
+  , ('R','\929')
+  , ('S','\931')
+  , ('T','\932')
+  , ('U','\933')
+  , ('V','\962')
+  , ('W','\937')
+  , ('W','\8486')
+  , ('X','\926')
+  , ('Y','\936')
+  , ('Z','\918')
+  , ('[','[')
+  , ('\\','\8756')
+  , (']',']')
+  , ('^','\8869')
+  , ('_','_')
+  , ('`','\63717')
+  , ('a','\945')
+  , ('b','\946')
+  , ('c','\967')
+  , ('d','\948')
+  , ('e','\949')
+  , ('f','\966')
+  , ('g','\947')
+  , ('h','\951')
+  , ('i','\953')
+  , ('j','\981')
+  , ('k','\954')
+  , ('l','\955')
+  , ('m','\181')
+  , ('m','\956')
+  , ('n','\957')
+  , ('o','\959')
+  , ('p','\960')
+  , ('q','\952')
+  , ('r','\961')
+  , ('s','\963')
+  , ('t','\964')
+  , ('u','\965')
+  , ('v','\982')
+  , ('w','\969')
+  , ('x','\958')
+  , ('y','\968')
+  , ('z','\950')
+  , ('{','{')
+  , ('|','|')
+  , ('}','}')
+  , ('~','\8764')
+  , ('\160','\8364')
+  , ('\161','\978')
+  , ('\162','\8242')
+  , ('\163','\8804')
+  , ('\164','\8260')
+  , ('\164','\8725')
+  , ('\165','\8734')
+  , ('\166','\402')
+  , ('\167','\9827')
+  , ('\168','\9830')
+  , ('\169','\9829')
+  , ('\170','\9824')
+  , ('\171','\8596')
+  , ('\172','\8592')
+  , ('\173','\8593')
+  , ('\174','\8594')
+  , ('\175','\8595')
+  , ('\176','\176')
+  , ('\177','\177')
+  , ('\178','\8243')
+  , ('\179','\8805')
+  , ('\180','\215')
+  , ('\181','\8733')
+  , ('\182','\8706')
+  , ('\183','\8226')
+  , ('\184','\247')
+  , ('\185','\8800')
+  , ('\186','\8801')
+  , ('\187','\8776')
+  , ('\188','\8230')
+  , ('\189','\63718')
+  , ('\190','\63719')
+  , ('\191','\8629')
+  , ('\192','\8501')
+  , ('\193','\8465')
+  , ('\194','\8476')
+  , ('\195','\8472')
+  , ('\196','\8855')
+  , ('\197','\8853')
+  , ('\198','\8709')
+  , ('\199','\8745')
+  , ('\200','\8746')
+  , ('\201','\8835')
+  , ('\202','\8839')
+  , ('\203','\8836')
+  , ('\204','\8834')
+  , ('\205','\8838')
+  , ('\206','\8712')
+  , ('\207','\8713')
+  , ('\208','\8736')
+  , ('\209','\8711')
+  , ('\210','\63194')
+  , ('\211','\63193')
+  , ('\212','\63195')
+  , ('\213','\8719')
+  , ('\214','\8730')
+  , ('\215','\8901')
+  , ('\216','\172')
+  , ('\217','\8743')
+  , ('\218','\8744')
+  , ('\219','\8660')
+  , ('\220','\8656')
+  , ('\221','\8657')
+  , ('\222','\8658')
+  , ('\223','\8659')
+  , ('\224','\9674')
+  , ('\225','\9001')
+  , ('\226','\63720')
+  , ('\227','\63721')
+  , ('\228','\63722')
+  , ('\229','\8721')
+  , ('\230','\63723')
+  , ('\231','\63724')
+  , ('\232','\63725')
+  , ('\233','\63726')
+  , ('\234','\63727')
+  , ('\235','\63728')
+  , ('\236','\63729')
+  , ('\237','\63730')
+  , ('\238','\63731')
+  , ('\239','\63732')
+  , ('\241','\9002')
+  , ('\242','\8747')
+  , ('\243','\8992')
+  , ('\244','\63733')
+  , ('\245','\8993')
+  , ('\246','\63734')
+  , ('\247','\63735')
+  , ('\248','\63736')
+  , ('\249','\63737')
+  , ('\250','\63738')
+  , ('\251','\63739')
+  , ('\252','\63740')
+  , ('\253','\63741')
+  , ('\254','\63742')]
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index beb58fed2..b12062407 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -59,10 +59,13 @@ import Data.Bits ((.|.))
 import qualified Data.ByteString.Lazy as B
 import qualified Text.Pandoc.UTF8 as UTF8
 import Control.Monad.Reader
+import Control.Applicative ((<$>))
 import qualified Data.Map as M
 import Text.Pandoc.Compat.Except
 import Text.Pandoc.Readers.Docx.OMath (readOMML)
+import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
 import Text.TeXMath (Exp)
+import Data.Char (readLitChar)
 
 data ReaderEnv = ReaderEnv { envNotes         :: Notes
                            , envNumbering     :: Numbering
@@ -673,8 +676,27 @@ elemToRunElem ns element
     return $ TextRun $ strContent element
   | isElem ns "w" "br" element = return LnBrk
   | isElem ns "w" "tab" element = return Tab
+  | isElem ns "w" "sym" element = return (getSymChar ns element)
   | otherwise = throwError WrongElem
 
+-- The char attribute is a hex string
+getSymChar :: NameSpaces -> Element -> RunElem
+getSymChar ns element
+  | Just s <- lowerFromPrivate <$> getCodepoint
+  , Just font <- getFont =
+  let [(char, _)] = readLitChar ("\\x" ++ s) in
+    TextRun . maybe "" (:[]) $ getUnicode font char
+  where
+    getCodepoint = findAttr (elemName ns "w" "char") element
+    getFont = stringToFont =<< findAttr (elemName ns "w" "font") element
+    lowerFromPrivate ('F':xs) = '0':xs
+    lowerFromPrivate xs = xs
+getSymChar _ _ = TextRun ""
+
+stringToFont :: String -> Maybe Font
+stringToFont "Symbol" = Just Symbol
+stringToFont _ = Nothing
+
 elemToRunElems :: NameSpaces -> Element -> D [RunElem]
 elemToRunElems ns element
   |  isElem ns "w" "r" element
diff --git a/tests/docx.unicode.docx b/tests/docx.unicode.docx
index 78d0107a1..4360f6be7 100644
--- a/tests/docx.unicode.docx
+++ b/tests/docx.unicode.docx
diff --git a/tests/docx.unicode.native b/tests/docx.unicode.native
index e636355c7..f37bbb1a7 100644
--- a/tests/docx.unicode.native
+++ b/tests/docx.unicode.native
@@ -1 +1 @@
-[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10."]]
+[Para [Str "Hello,",Space,Str "\19990\30028.",Space,Str "This",Space,Str "costs",Space,Str "\8364\&10.\8744\8744"]]
author	John MacFarlane <jgm@berkeley.edu>	2014-08-09 20:22:06 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2014-08-09 20:22:06 -0700
commit	eed1274fcaa0348fd08624eaa863ad26248958de (patch)
tree	6000aeb164abda25f1cce788e18e37e5180d7438
parent	c2a0d47c7b15483efa565ab2b6d9fa836e3a8818 (diff)
parent	ecb42cc002f892295ea33711947338580f60522b (diff)
download	pandoc-eed1274fcaa0348fd08624eaa863ad26248958de.tar.gz