summaryrefslogtreecommitdiff
path: root/lib/Hakyll/Core/Identifier/Pattern.hs
blob: 83d5adc2b68f305a35331458cdc6e96f228b38c0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
--------------------------------------------------------------------------------
-- | As 'Identifier' is used to specify a single item, a 'Pattern' is used to
-- specify a list of items.
--
-- In most cases, globs are used for patterns.
--
-- A very simple pattern of such a pattern is @\"foo\/bar\"@. This pattern will
-- only match the exact @foo\/bar@ identifier.
--
-- To match more than one identifier, there are different captures that one can
-- use:
--
-- * @\"*\"@: matches at most one element of an identifier;
--
-- * @\"**\"@: matches one or more elements of an identifier.
--
-- Some examples:
--
-- * @\"foo\/*\"@ will match @\"foo\/bar\"@ and @\"foo\/foo\"@, but not
--   @\"foo\/bar\/qux\"@;
--
-- * @\"**\"@ will match any identifier;
--
-- * @\"foo\/**\"@ will match @\"foo\/bar\"@ and @\"foo\/bar\/qux\"@, but not
--   @\"bar\/foo\"@;
--
-- * @\"foo\/*.html\"@ will match all HTML files in the @\"foo\/\"@ directory.
--
-- The 'capture' function allows the user to get access to the elements captured
-- by the capture elements in a glob or regex pattern.
module Hakyll.Core.Identifier.Pattern
    ( -- * The pattern type
      Pattern

      -- * Creating patterns
    , fromGlob
    , fromList
    , fromRegex
    , fromVersion
    , hasVersion
    , hasNoVersion

      -- * Composing patterns
    , (.&&.)
    , (.||.)
    , complement

      -- * Applying patterns
    , matches
    , filterMatches

      -- * Capturing strings
    , capture
    , fromCapture
    , fromCaptures
    ) where


--------------------------------------------------------------------------------
import           Control.Arrow          ((&&&), (>>>))
import           Control.Monad          (msum)
import           Data.Binary            (Binary (..), getWord8, putWord8)
import           Data.List              (inits, isPrefixOf, tails)
import           Data.Maybe             (isJust)
import           Data.Set               (Set)
import qualified Data.Set               as S


--------------------------------------------------------------------------------
import           GHC.Exts               (IsString, fromString)
import           Text.Regex.TDFA        ((=~))


--------------------------------------------------------------------------------
import           Hakyll.Core.Identifier


--------------------------------------------------------------------------------
-- | Elements of a glob pattern
data GlobComponent
    = Capture
    | CaptureMany
    | Literal String
    deriving (Eq, Show)


--------------------------------------------------------------------------------
instance Binary GlobComponent where
    put Capture     = putWord8 0
    put CaptureMany = putWord8 1
    put (Literal s) = putWord8 2 >> put s

    get = getWord8 >>= \t -> case t of
        0 -> pure Capture
        1 -> pure CaptureMany
        2 -> Literal <$> get
        _ -> error "Data.Binary.get: Invalid GlobComponent"


--------------------------------------------------------------------------------
-- | Type that allows matching on identifiers
data Pattern
    = Everything
    | Complement Pattern
    | And Pattern Pattern
    | Glob [GlobComponent]
    | List (Set Identifier)
    | Regex String
    | Version (Maybe String)
    deriving (Show)


--------------------------------------------------------------------------------
instance Binary Pattern where
    put Everything     = putWord8 0
    put (Complement p) = putWord8 1 >> put p
    put (And x y)      = putWord8 2 >> put x >> put y
    put (Glob g)       = putWord8 3 >> put g
    put (List is)      = putWord8 4 >> put is
    put (Regex r)      = putWord8 5 >> put r
    put (Version v)    = putWord8 6 >> put v

    get = getWord8 >>= \t -> case t of
        0 -> pure Everything
        1 -> Complement <$> get
        2 -> And <$> get <*> get
        3 -> Glob <$> get
        4 -> List <$> get
        5 -> Regex <$> get
        _ -> Version <$> get


--------------------------------------------------------------------------------
instance IsString Pattern where
    fromString = fromGlob


--------------------------------------------------------------------------------
instance Monoid Pattern where
    mempty  = Everything
    mappend = (.&&.)


--------------------------------------------------------------------------------
-- | Parse a pattern from a string
fromGlob :: String -> Pattern
fromGlob = Glob . parse'
  where
    parse' str =
        let (chunk, rest) = break (`elem` "\\*") str
        in case rest of
            ('\\' : x   : xs) -> Literal (chunk ++ [x]) : parse' xs
            ('*'  : '*' : xs) -> Literal chunk : CaptureMany : parse' xs
            ('*'  : xs)       -> Literal chunk : Capture : parse' xs
            xs                -> Literal chunk : Literal xs : []


--------------------------------------------------------------------------------
-- | Create a 'Pattern' from a list of 'Identifier's it should match.
--
-- /Warning/: use this carefully with 'hasNoVersion' and 'hasVersion'. The
-- 'Identifier's in the list /already/ have versions assigned, and the pattern
-- will then only match the intersection of both versions.
--
-- A more concrete example,
--
-- > fromList ["foo.markdown"] .&&. hasVersion "pdf"
--
-- will not match anything! The @"foo.markdown"@ 'Identifier' has no version
-- assigned, so the LHS of '.&&.' will only match this 'Identifier' with no
-- version. The RHS only matches 'Identifier's with version set to @"pdf"@ --
-- hence, this pattern matches nothing.
--
-- The correct way to use this is:
--
-- > fromList $ map (setVersion $ Just "pdf") ["foo.markdown"]
fromList :: [Identifier] -> Pattern
fromList = List . S.fromList


--------------------------------------------------------------------------------
-- | Create a 'Pattern' from a regex
--
-- Example:
--
-- > regex "^foo/[^x]*$
fromRegex :: String -> Pattern
fromRegex = Regex


--------------------------------------------------------------------------------
-- | Create a pattern which matches all items with the given version.
fromVersion :: Maybe String -> Pattern
fromVersion = Version


--------------------------------------------------------------------------------
-- | Specify a version, e.g.
--
-- > "foo/*.markdown" .&&. hasVersion "pdf"
hasVersion :: String -> Pattern
hasVersion = fromVersion . Just


--------------------------------------------------------------------------------
-- | Match only if the identifier has no version set, e.g.
--
-- > "foo/*.markdown" .&&. hasNoVersion
hasNoVersion :: Pattern
hasNoVersion = fromVersion Nothing


--------------------------------------------------------------------------------
-- | '&&' for patterns: the given identifier must match both subterms
(.&&.) :: Pattern -> Pattern -> Pattern
x .&&. y = And x y
infixr 3 .&&.


--------------------------------------------------------------------------------
-- | '||' for patterns: the given identifier must match any subterm
(.||.) :: Pattern -> Pattern -> Pattern
x .||. y = complement (complement x `And` complement y)  -- De Morgan's law
infixr 2 .||.


--------------------------------------------------------------------------------
-- | Inverts a pattern, e.g.
--
-- > complement "foo/bar.html"
--
-- will match /anything/ except @\"foo\/bar.html\"@
complement :: Pattern -> Pattern
complement = Complement


--------------------------------------------------------------------------------
-- | Check if an identifier matches a pattern
matches :: Pattern -> Identifier -> Bool
matches Everything     _ = True
matches (Complement p) i = not $ matches p i
matches (And x y)      i = matches x i && matches y i
matches (Glob p)       i = isJust $ capture (Glob p) i
matches (List l)       i = i `S.member` l
matches (Regex r)      i = toFilePath i =~ r
matches (Version v)    i = identifierVersion i == v


--------------------------------------------------------------------------------
-- | Given a list of identifiers, retain only those who match the given pattern
filterMatches :: Pattern -> [Identifier] -> [Identifier]
filterMatches = filter . matches


--------------------------------------------------------------------------------
-- | Split a list at every possible point, generate a list of (init, tail)
-- cases. The result is sorted with inits decreasing in length.
splits :: [a] -> [([a], [a])]
splits = inits &&& tails >>> uncurry zip >>> reverse


--------------------------------------------------------------------------------
-- | Match a glob or regex pattern against an identifier, generating a list of captures
capture :: Pattern -> Identifier -> Maybe [String]
capture (Glob p) i = capture' p (toFilePath i)
capture (Regex pat) i = Just groups
  where (_, _, _, groups) = ((toFilePath i) =~ pat) :: (String, String, String, [String])
capture _        _ = Nothing


--------------------------------------------------------------------------------
-- | Internal verion of 'capture'
capture' :: [GlobComponent] -> String -> Maybe [String]
capture' [] [] = Just []  -- An empty match
capture' [] _  = Nothing  -- No match
capture' (Literal l : ms) str
    -- Match the literal against the string
    | l `isPrefixOf` str = capture' ms $ drop (length l) str
    | otherwise          = Nothing
capture' (Capture : ms) str =
    -- Match until the next /
    let (chunk, rest) = break (== '/') str
    in msum $ [ fmap (i :) (capture' ms (t ++ rest)) | (i, t) <- splits chunk ]
capture' (CaptureMany : ms) str =
    -- Match everything
    msum $ [ fmap (i :) (capture' ms t) | (i, t) <- splits str ]


--------------------------------------------------------------------------------
-- | Create an identifier from a pattern by filling in the captures with a given
-- string
--
-- Example:
--
-- > fromCapture (fromGlob "tags/*") "foo"
--
-- Result:
--
-- > "tags/foo"
fromCapture :: Pattern -> String -> Identifier
fromCapture pattern = fromCaptures pattern . repeat


--------------------------------------------------------------------------------
-- | Create an identifier from a pattern by filling in the captures with the
-- given list of strings
fromCaptures :: Pattern -> [String] -> Identifier
fromCaptures (Glob p) = fromFilePath . fromCaptures' p
fromCaptures _        = error $
    "Hakyll.Core.Identifier.Pattern.fromCaptures: fromCaptures only works " ++
    "on simple globs!"


--------------------------------------------------------------------------------
-- | Internally used version of 'fromCaptures'
fromCaptures' :: [GlobComponent] -> [String] -> String
fromCaptures' []        _ = mempty
fromCaptures' (m : ms) [] = case m of
    Literal l -> l `mappend` fromCaptures' ms []
    _         -> error $  "Hakyll.Core.Identifier.Pattern.fromCaptures': "
                       ++ "identifier list exhausted"
fromCaptures' (m : ms) ids@(i : is) = case m of
    Literal l -> l `mappend` fromCaptures' ms ids
    _         -> i `mappend` fromCaptures' ms is