summaryrefslogtreecommitdiff
path: root/app/XMLParse.hs
blob: 58e750d2ab827209882173332293c0f25a5a47de (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
{-# OPTIONS -cpp #-}

-- A copy of Text.XML.HaXml.Parse from HaXml-1.25.5, with seemingly obvious
-- additions to the export list.
--
-- (c) copyright 1998-1999    Malcolm Wallace and Colin Runciman
-- (c) copyright 2000-2013    Malcolm Wallace
-- (c) copyright 2021         Software Freedom Law Center
-- Licensed under the terms of the GNU Lesser General Public Licence (LGPL)
-- version 2.1.
--
-- As a relaxation of clause 6 of the LGPL, the copyright holders of this
-- library give permission to use, copy, link, modify, and distribute,
-- binary-only object-code versions of an executable linked with the original
-- unmodified Library, without requiring the supply of any mechanism to modify
-- or replace the Library and relink (clauses 6a, 6b, 6c, 6d, 6e), provided
-- that all the other terms of clause 6 are complied with.

module XMLParse
  (
  -- * Parse a whole document
    xmlParse, xmlParse'
  -- * Parse just a DTD
  , dtdParse, dtdParse'
  -- * Parse a partial document
  , xmlParseWith
  -- * Individual parsers for use with /xmlParseWith/ and module SAX
  , document, element, content
  , comment, cdsect, chardata
  , reference, doctypedecl
  , processinginstruction
  , elemtag, qname, name, tok
  , elemOpenTag, elemCloseTag
  , emptySTs, XParser
  , prolog
  -- * These general utility functions don't belong here
  , fst3, snd3, thd3
  ) where

-- An XML parser, written using a slightly extended version of the
-- Hutton/Meijer parser combinators.  The input is tokenised internally
-- by the lexer xmlLex.  Whilst parsing, we gather a symbol
-- table of entity references.  PERefs must be defined before use, so we
-- expand their uses as we encounter them, forcing the remainder of the
-- input to be re-lexed and re-parsed.  GERefs are simply stored for
-- later retrieval.

import Prelude hiding (either,maybe,sequence)
import qualified Prelude (either)
import Data.Maybe hiding (maybe)
import Data.List (intersperse)       -- debugging only
import Data.Char (isSpace,isDigit,isHexDigit)
import Control.Monad hiding (sequence)
import Numeric (readDec,readHex)

import Text.XML.HaXml.Types
import Text.XML.HaXml.Namespaces
import Text.XML.HaXml.Posn
import Text.XML.HaXml.Lex
import Text.ParserCombinators.Poly.State

import System.FilePath (combine, dropFileName)


#if ( defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ > 502 ) || \
    ( defined(__NHC__) && __NHC__ > 114 ) || defined(__HUGS__)
import System.IO.Unsafe (unsafePerformIO)
#elif defined(__GLASGOW_HASKELL__)
import IOExts (unsafePerformIO)
#elif defined(__NHC__)
import IOExtras (unsafePerformIO)
#elif defined(__HBC__)
import UnsafePerformIO
#endif

--  #define DEBUG

#if defined(DEBUG)
#  if ( defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ > 502 ) || \
      ( defined(__NHC__) && __NHC__ > 114 ) || defined(__HUGS__)
import Debug.Trace(trace)
#  elif defined(__GLASGOW_HASKELL__)
import IOExts(trace)
#  elif defined(__NHC__) || defined(__HBC__)
import NonStdTrace
#  endif
v `debug` s = trace s v
#else
v `debug` s = v
#endif
debug :: a -> String -> a


-- | To parse a whole document, @xmlParse file content@ takes a filename
--   (for generating error reports) and the string content of that file.
--   A parse error causes program failure, with message to stderr.
xmlParse :: String -> String -> Document Posn

-- | To parse a whole document, @xmlParse' file content@ takes a filename
--   (for generating error reports) and the string content of that file.
--   Any parse error message is passed back to the caller through the
--   @Either@ type.
xmlParse' :: String -> String -> Either String (Document Posn)

-- | To parse just a DTD, @dtdParse file content@ takes a filename
--   (for generating error reports) and the string content of that
--   file.  If no DTD was found, you get @Nothing@ rather than an error.
--   However, if a DTD is found but contains errors, the program crashes.
dtdParse  :: String -> String -> Maybe DocTypeDecl

-- | To parse just a DTD, @dtdParse' file content@ takes a filename
--   (for generating error reports) and the string content of that
--   file.  If no DTD was found, you get @Right Nothing@.
--   If a DTD was found but contains errors, you get a @Left message@.
dtdParse' :: String -> String -> Either String (Maybe DocTypeDecl)

xmlParse  name  = Prelude.either error id . xmlParse' name
dtdParse  name  = Prelude.either error id . dtdParse' name

xmlParse' name  = fst3 . runParser (toEOF document) emptySTs . xmlLex name
dtdParse' name  = fst3 . runParser justDTD  emptySTs . xmlLex name

toEOF :: XParser a -> XParser a
toEOF = id	-- there are other possible implementations...

-- | To parse a partial document, e.g. from an XML-based stream protocol,
--   where you may later want to get more document elements from the same
--   stream.  Arguments are: a parser for the item you want, and the
--   already-lexed input to parse from.  Returns the item you wanted
--   (or an error message), plus the remainder of the input.
xmlParseWith :: XParser a -> [(Posn,TokenT)]
                -> (Either String a, [(Posn,TokenT)])
xmlParseWith p = (\(v,_,s)->(v,s)) . runParser p emptySTs


---- Symbol table stuff ----

type SymTabs = (SymTab PEDef, SymTab EntityDef)

-- | Some empty symbol tables for GE and PE references.
emptySTs :: SymTabs
emptySTs = (emptyST, emptyST)

addPE :: String -> PEDef -> SymTabs -> SymTabs
addPE n v (pe,ge) = (addST n v pe, ge)

addGE :: String -> EntityDef -> SymTabs -> SymTabs
addGE n v (pe,ge) = let newge = addST n v ge in newge `seq` (pe, newge)

lookupPE :: String -> SymTabs -> Maybe PEDef
lookupPE s (pe,_ge) = lookupST s pe

flattenEV :: EntityValue -> String
flattenEV (EntityValue evs) = concatMap flatten evs
  where
    flatten (EVString s)          = s
    flatten (EVRef (RefEntity r)) = "&" ++r++";"
    flatten (EVRef (RefChar r))   = "&#"++show r++";"
 -- flatten (EVPERef n)           = "%" ++n++";"


---- Misc ----
fst3 :: (a,b,c) -> a
snd3 :: (a,b,c) -> b
thd3 :: (a,b,c) -> c

fst3 (a,_,_) = a
snd3 (_,a,_) = a
thd3 (_,_,a) = a


---- Auxiliary Parsing Functions ----

-- | XParser is just a specialisation of the PolyState parser.
type XParser a = Parser SymTabs (Posn,TokenT) a

-- | Return the next token from the input only if it matches the given token.
tok :: TokenT -> XParser TokenT
tok t = do (p,t') <- next
           case t' of TokError _    -> report failBad (show t) p t'
                      _ | t'==t     -> return t
                        | otherwise -> report fail (show t) p t'
nottok :: [TokenT] -> XParser TokenT
nottok ts = do (p,t) <- next
               if t`elem`ts then report fail ("no "++show t) p t
                            else return t

-- | Return a qualified name (although the namespace qualification is not
--   processed here; this is merely to get the correct type).
qname :: XParser QName
qname = fmap N name

-- | Return just a name, e.g. element name, attribute name.
name :: XParser Name
name = do (p,tok) <- next
          case tok of
            TokName s  -> return s
            TokError _ -> report failBad "a name" p tok
            _          -> report fail "a name" p tok

string, freetext :: XParser String
string   = do (p,t) <- next
              case t of TokName s -> return s
                        _         -> report fail "text" p t
freetext = do (p,t) <- next
              case t of TokFreeText s -> return s
                        _             -> report fail "text" p t

maybe :: XParser a -> XParser (Maybe a)
maybe p =
    ( p >>= return . Just) `onFail`
    ( return Nothing)

either :: XParser a -> XParser b -> XParser (Either a b)
either p q =
    ( p >>= return . Left) `onFail`
    ( q >>= return . Right)

word :: String -> XParser ()
word s = do { x <- next
            ; case x of
                (_p,TokName n)     | s==n -> return ()
                (_p,TokFreeText n) | s==n -> return ()
                ( p,t@(TokError _)) -> report failBad (show s) p t
                ( p,t) -> report fail (show s) p t
            }

posn :: XParser Posn
posn = do { x@(p,_) <- next
          ; reparse [x]
          ; return p
          }

nmtoken :: XParser NmToken
nmtoken = (string `onFail` freetext)

failP, failBadP :: String -> XParser a
failP msg = do { p <- posn; fail (msg++"\n    at "++show p) }
failBadP msg = do { p <- posn; failBad (msg++"\n    at "++show p) }

report :: (String->XParser a) -> String -> Posn -> TokenT -> XParser a
report fail expect p t = fail ("Expected "++expect++" but found "++show t
                               ++"\n  in "++show p)

adjustErrP :: XParser a -> (String->String) -> XParser a
p `adjustErrP` f = p `onFail` do pn <- posn
                                 (p `adjustErr` f) `adjustErr` (++show pn)

peRef :: XParser a -> XParser a
peRef p =
    p `onFail`
    do pn <- posn
       n <- pereference
       tr <- stQuery (lookupPE n) `debug` ("Looking up %"++n)
       case tr of
           Just (PEDefEntityValue ev) ->
                      do reparse (xmlReLex (posInNewCxt ("macro %"++n++";")
                                                        (Just pn))
                                           (flattenEV ev))
                               `debug` ("  defn:  "++flattenEV ev)
                         peRef p
           Just (PEDefExternalID (PUBLIC _ (SystemLiteral f))) ->
                      do let f' = combine (dropFileName $ posnFilename pn) f
                             val = unsafePerformIO (readFile f')
                         reparse (xmlReLex (posInNewCxt f'
                                                        (Just pn)) val)
                               `debug` ("  reading from file "++f')
                         peRef p
           Just (PEDefExternalID (SYSTEM (SystemLiteral f))) ->
                      do let f' = combine (dropFileName $ posnFilename pn) f
                             val = unsafePerformIO (readFile f')
                         reparse (xmlReLex (posInNewCxt f'
                                                        (Just pn)) val)
                               `debug` ("  reading from file "++f')
                         peRef p
           Nothing -> fail ("PEReference use before definition: "++"%"++n++";"
                           ++"\n    at "++show pn)

blank :: XParser a -> XParser a
blank p =
    p `onFail`
    do n <- pereference
       tr <- stQuery (lookupPE n) `debug` ("Looking up %"++n++" (is blank?)")
       case tr of
           Just (PEDefEntityValue ev)
                    | all isSpace (flattenEV ev)  ->
                            do blank p `debug` "Empty macro definition"
           Just _  -> failP ("expected a blank PERef macro: "++"%"++n++";")
           Nothing -> failP ("PEReference use before definition: "++"%"++n++";")



---- XML Parsing Functions ----

justDTD :: XParser (Maybe DocTypeDecl)
justDTD =
  do (ExtSubset _ ds) <- extsubset `debug` "Trying external subset"
     if null ds then fail "empty"
         else return (Just (DTD (N "extsubset") Nothing (concatMap extract ds)))
  `onFail`
  do (Prolog _ _ dtd _) <- prolog
     return dtd
 where extract (ExtMarkupDecl m) = [m]
       extract (ExtConditionalSect (IncludeSect i)) = concatMap extract i
       extract (ExtConditionalSect (IgnoreSect _i)) = []

-- | Return an entire XML document including prolog and trailing junk.
document :: XParser (Document Posn)
document = do
    p <- prolog `adjustErr` ("unrecognisable XML prolog\n"++)
    e <- element
    ms <- many misc
    (_,ge) <- stGet
    return (Document p ge e ms)

-- | Return an XML comment.
comment :: XParser Comment
comment = do
    bracket (tok TokCommentOpen) (tok TokCommentClose) freetext
--  tok TokCommentOpen
--  commit $ do
--    c <- freetext
--    tok TokCommentClose
--    return c

-- | Parse a processing instruction.
processinginstruction :: XParser ProcessingInstruction
processinginstruction = do
    tok TokPIOpen
    commit $ do
      n <- string  `onFail` failP "processing instruction has no target"
      f <- freetext
      tok TokPIClose `onFail` failP ("missing ?> in <?"++n)
      return (n, f)

cdsect :: XParser CDSect
cdsect = do
    tok TokSectionOpen
    bracket (tok (TokSection CDATAx)) (commit $ tok TokSectionClose) chardata

prolog :: XParser Prolog
prolog = do
    x   <- maybe xmldecl
    m1  <- many misc
    dtd <- maybe doctypedecl
    m2  <- many misc
    return (Prolog x m1 dtd m2)

xmldecl :: XParser XMLDecl
xmldecl = do
    tok TokPIOpen
    (word "xml" `onFail` word "XML")
    p <- posn
    s <- freetext
    tok TokPIClose `onFail` failBadP "missing ?> in <?xml ...?>"
    raise ((runParser aux emptySTs . xmlReLex p) s)
  where
    aux = do
        v <- versioninfo  `onFail` failP "missing XML version info"
        e <- maybe encodingdecl
        s <- maybe sddecl
        return (XMLDecl v e s)
    raise (Left err, _, _) = failP err
    raise (Right ok, _, _) = return ok

versioninfo :: XParser VersionInfo
versioninfo = do
    (word "version" `onFail` word "VERSION")
    tok TokEqual
    bracket (tok TokQuote) (commit $ tok TokQuote) freetext

misc :: XParser Misc
misc =
    oneOf' [ ("<!--comment-->",  comment >>= return . Comment)
           , ("<?PI?>",          processinginstruction >>= return . PI)
           ]

-- | Return a DOCTYPE decl, indicating a DTD.
doctypedecl :: XParser DocTypeDecl
doctypedecl = do
    tok TokSpecialOpen
    tok (TokSpecial DOCTYPEx)
    commit $ do
      n   <- qname
      eid <- maybe externalid
      es  <- maybe (bracket (tok TokSqOpen) (commit $ tok TokSqClose)
                            (many (peRef markupdecl)))
      blank (tok TokAnyClose)  `onFail` failP "missing > in DOCTYPE decl"
      return (DTD n eid (case es of { Nothing -> []; Just e -> e }))

-- | Return a DTD markup decl, e.g. ELEMENT, ATTLIST, etc
markupdecl :: XParser MarkupDecl
markupdecl =
  oneOf' [ ("ELEMENT",  elementdecl  >>= return . Element)
         , ("ATTLIST",  attlistdecl  >>= return . AttList)
         , ("ENTITY",   entitydecl   >>= return . Entity)
         , ("NOTATION", notationdecl >>= return . Notation)
         , ("misc",     misc         >>= return . MarkupMisc)
         ]
    `adjustErrP`
          ("when looking for a markup decl,\n"++)
 --       (\  (ELEMENT, ATTLIST, ENTITY, NOTATION, <!--comment-->, or <?PI?>")

extsubset :: XParser ExtSubset
extsubset = do
    td <- maybe textdecl
    ds <- many (peRef extsubsetdecl)
    return (ExtSubset td ds)

extsubsetdecl :: XParser ExtSubsetDecl
extsubsetdecl =
    ( markupdecl >>= return . ExtMarkupDecl) `onFail`
    ( conditionalsect >>= return . ExtConditionalSect)

sddecl :: XParser SDDecl
sddecl = do
    (word "standalone" `onFail` word "STANDALONE")
    commit $ do
      tok TokEqual `onFail` failP "missing = in 'standalone' decl"
      bracket (tok TokQuote) (commit $ tok TokQuote)
              ( (word "yes" >> return True) `onFail`
                (word "no" >> return False) `onFail`
                failP "'standalone' decl requires 'yes' or 'no' value" )

{-
element :: XParser (Element Posn)
element = do
    tok TokAnyOpen
    (ElemTag n as) <- elemtag
    oneOf' [ ("self-closing tag <"++n++"/>"
             ,  do tok TokEndClose
                   return (Elem n as []))
           , ("after open tag <"++n++">"
             ,  do tok TokAnyClose
                   cs <- many content
                   p  <- posn
                   m  <- bracket (tok TokEndOpen) (commit $ tok TokAnyClose) qname
                   checkmatch p n m
                   return (Elem n as cs))
           ] `adjustErr` (("in element tag "++n++",\n")++)
-}

-- | Return a complete element including all its inner content.
element :: XParser (Element Posn)
element = do
    tok TokAnyOpen
    (ElemTag n as) <- elemtag
    ( do tok TokEndClose
         commit (return (Elem n as []))
        `onFail`
      do tok TokAnyClose
         commit $ do
           return (Elem n as) `apply`
                 manyFinally content
                             (do p <- posn
                                 m <- bracket (tok TokEndOpen)
                                              (commit $ tok TokAnyClose) qname
                                 checkmatch p n m)
      ) `adjustErrBad` (("in element tag "++printableName n++",\n")++)

checkmatch :: Posn -> QName -> QName -> XParser ()
checkmatch p n m =
  if n == m then return ()
  else failBad ("tag <"++printableName n++"> terminated by </"++printableName m
                ++">\n  at "++show p)

-- | Parse only the parts between angle brackets in an element tag.
elemtag :: XParser ElemTag
elemtag = do
    n  <- qname `adjustErrBad` ("malformed element tag\n"++)
    as <- many attribute
    return (ElemTag n as)

-- | For use with stream parsers - returns the complete opening element tag.
elemOpenTag :: XParser ElemTag
elemOpenTag = do
    tok TokAnyOpen
    e <- elemtag
    tok TokAnyClose
    return e

-- | For use with stream parsers - accepts a closing tag, provided it
--   matches the given element name.
elemCloseTag :: QName -> XParser ()
elemCloseTag n = do
    tok TokEndOpen
    p <- posn
    m <- qname
    tok TokAnyClose
    checkmatch p n m

attribute :: XParser Attribute
attribute = do
    n <- qname `adjustErr` ("malformed attribute name\n"++)
    tok TokEqual `onFail` failBadP "missing = in attribute"
    v <- attvalue `onFail` failBadP "missing attvalue"
    return (n,v)

-- | Return a content particle, e.g. text, element, reference, etc
content :: XParser (Content Posn)
content =
  do { p  <- posn
     ; c' <- content'
     ; return (c' p)
     }
  where
     content' = oneOf' [ ("element",   element   >>= return . CElem)
                       , ("chardata",  chardata  >>= return . CString False)
                       , ("reference", reference >>= return . CRef)
                       , ("CDATA",     cdsect    >>= return . CString True)
                       , ("misc",      misc      >>= return . CMisc)
                       ]
                  `adjustErrP` ("when looking for a content item,\n"++)
-- (\    (element, text, reference, CDATA section, <!--comment-->, or <?PI?>")

elementdecl :: XParser ElementDecl
elementdecl = do
    tok TokSpecialOpen
    tok (TokSpecial ELEMENTx)
    n <- peRef qname `adjustErrBad` ("expecting identifier in ELEMENT decl\n"++)
    c <- peRef contentspec
             `adjustErrBad` (("in content spec of ELEMENT decl: "
                              ++printableName n++"\n")++)
    blank (tok TokAnyClose) `onFail` failBadP
       ("expected > terminating ELEMENT decl"
       ++"\n    element name was "++show (printableName n)
       ++"\n    contentspec was "++(\ (ContentSpec p)-> debugShowCP p) c)
    return (ElementDecl n c)

contentspec :: XParser ContentSpec
contentspec =
    oneOf' [ ("EMPTY",  peRef (word "EMPTY") >> return EMPTY)
           , ("ANY",    peRef (word "ANY") >> return ANY)
           , ("mixed",  peRef mixed >>= return . Mixed)
           , ("simple", peRef cp >>= return . ContentSpec)
           ]
 --   `adjustErr` ("when looking for content spec,\n"++)
 --   `adjustErr` (++"\nLooking for content spec (EMPTY, ANY, mixed, etc)")

choice :: XParser [CP]
choice = do
    bracket (tok TokBraOpen `debug` "Trying choice")
            (blank (tok TokBraClose `debug` "Succeeded with choice"))
            (peRef cp `sepBy1` blank (tok TokPipe))

sequence :: XParser [CP]
sequence = do
    bracket (tok TokBraOpen `debug` "Trying sequence")
            (blank (tok TokBraClose `debug` "Succeeded with sequence"))
            (peRef cp `sepBy1` blank (tok TokComma))

cp :: XParser CP
cp = oneOf [ ( do n <- qname
                  m <- modifier
                  let c = TagName n m
                  return c `debug` ("ContentSpec: name "++debugShowCP c))
           , ( do ss <- sequence
                  m <- modifier
                  let c = Seq ss m
                  return c `debug` ("ContentSpec: sequence "++debugShowCP c))
           , ( do cs <- choice
                  m <- modifier
                  let c = Choice cs m
                  return c `debug` ("ContentSpec: choice "++debugShowCP c))
           ] `adjustErr` (++"\nwhen looking for a content particle")

modifier :: XParser Modifier
modifier = oneOf [ ( tok TokStar >> return Star )
                 , ( tok TokQuery >> return Query )
                 , ( tok TokPlus >> return Plus )
                 , ( return None )
                 ]

-- just for debugging
debugShowCP :: CP -> String
debugShowCP cp = case cp of
    TagName n m  -> printableName n++debugShowModifier m
    Choice cps m -> '(': concat (intersperse "|" (map debugShowCP cps))++")"++debugShowModifier m
    Seq cps m    -> '(': concat (intersperse "," (map debugShowCP cps))++")"++debugShowModifier m
debugShowModifier :: Modifier -> String
debugShowModifier modifier = case modifier of
    None  -> ""
    Query -> "?"
    Star  -> "*"
    Plus  -> "+"
----

mixed :: XParser Mixed
mixed = do
    tok TokBraOpen
    peRef (do tok TokHash
              word "PCDATA")
    commit $
      oneOf [ ( do cs <- many (peRef (do tok TokPipe
                                         peRef qname))
                   blank (tok TokBraClose >> tok TokStar)
                   return (PCDATAplus cs))
            , ( blank (tok TokBraClose >> tok TokStar) >> return PCDATA)
            , ( blank (tok TokBraClose) >> return PCDATA)
            ]
        `adjustErrP` (++"\nLooking for mixed content spec (#PCDATA | ...)*\n")

attlistdecl :: XParser AttListDecl
attlistdecl = do
    tok TokSpecialOpen
    tok (TokSpecial ATTLISTx)
    n <- peRef qname `adjustErrBad` ("expecting identifier in ATTLIST\n"++)
    ds <- peRef (many1 (peRef attdef))
    blank (tok TokAnyClose) `onFail` failBadP "missing > terminating ATTLIST"
    return (AttListDecl n ds)

attdef :: XParser AttDef
attdef =
  do n <- peRef qname `adjustErr` ("expecting attribute name\n"++)
     t <- peRef atttype `adjustErr` (("within attlist defn: "
                                     ++printableName n++",\n")++)
     d <- peRef defaultdecl `adjustErr` (("in attlist defn: "
                                         ++printableName n++",\n")++)
     return (AttDef n t d)

atttype :: XParser AttType
atttype =
    oneOf' [ ("CDATA",      word "CDATA" >> return StringType)
           , ("tokenized",  tokenizedtype >>= return . TokenizedType)
           , ("enumerated", enumeratedtype >>= return . EnumeratedType)
           ]
      `adjustErr` ("looking for ATTTYPE,\n"++)
 --   `adjustErr` (++"\nLooking for ATTTYPE (CDATA, tokenized, or enumerated")

tokenizedtype :: XParser TokenizedType
tokenizedtype =
    oneOf [ ( word "ID" >> return ID)
          , ( word "IDREF" >> return IDREF)
          , ( word "IDREFS" >> return IDREFS)
          , ( word "ENTITY" >> return ENTITY)
          , ( word "ENTITIES" >> return ENTITIES)
          , ( word "NMTOKEN" >> return NMTOKEN)
          , ( word "NMTOKENS" >> return NMTOKENS)
          ] `onFail`
    do { t <- next
       ; failP ("Expected one of"
               ++" (ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS)"
               ++"\nbut got "++show t)
       }

enumeratedtype :: XParser EnumeratedType
enumeratedtype =
    oneOf' [ ("NOTATION",   notationtype >>= return . NotationType)
           , ("enumerated", enumeration >>= return . Enumeration)
           ]
      `adjustErr` ("looking for an enumerated or NOTATION type,\n"++)

notationtype :: XParser NotationType
notationtype = do
    word "NOTATION"
    bracket (tok TokBraOpen) (commit $ blank $ tok TokBraClose)
            (peRef name `sepBy1` peRef (tok TokPipe))

enumeration :: XParser Enumeration
enumeration =
    bracket (tok TokBraOpen) (commit $ blank $ tok TokBraClose)
            (peRef nmtoken `sepBy1` blank (peRef (tok TokPipe)))

defaultdecl :: XParser DefaultDecl
defaultdecl =
    oneOf' [ ("REQUIRED",  tok TokHash >> word "REQUIRED" >> return REQUIRED)
           , ("IMPLIED",   tok TokHash >> word "IMPLIED" >> return IMPLIED)
           , ("FIXED",     do f <- maybe (tok TokHash >> word "FIXED"
                                                      >> return FIXED)
                              a <- peRef attvalue
                              return (DefaultTo a f) )
           ]
        `adjustErr` ("looking for an attribute default decl,\n"++)

conditionalsect :: XParser ConditionalSect
conditionalsect = oneOf'
    [ ( "INCLUDE"
      , do tok TokSectionOpen
           peRef (tok (TokSection INCLUDEx))
           p <- posn
           tok TokSqOpen `onFail` failBadP "missing [ after INCLUDE"
           i <- many (peRef extsubsetdecl)
           tok TokSectionClose
                   `onFail` failBadP ("missing ]]> for INCLUDE section"
                                     ++"\n    begun at "++show p)
           return (IncludeSect i))
    , ( "IGNORE"
      , do tok TokSectionOpen
           peRef (tok (TokSection IGNOREx))
           p <- posn
           tok TokSqOpen `onFail` failBadP "missing [ after IGNORE"
           many newIgnore  -- many ignoresectcontents
           tok TokSectionClose
                   `onFail` failBadP ("missing ]]> for IGNORE section"
                                     ++"\n    begun at "++show p)
           return (IgnoreSect []))
    ] `adjustErr` ("in a conditional section,\n"++)

newIgnore :: XParser Ignore
newIgnore =
    ( do tok TokSectionOpen
         many newIgnore `debug` "IGNORING conditional section"
         tok TokSectionClose
         return Ignore `debug` "end of IGNORED conditional section") `onFail`
    ( do t <- nottok [TokSectionOpen,TokSectionClose]
         return Ignore  `debug` ("ignoring: "++show t))

--- obsolete?
--ignoresectcontents :: XParser IgnoreSectContents
--ignoresectcontents = do
--    i <- ignore
--    is <- many (do tok TokSectionOpen
--                   ic <- ignoresectcontents
--                   tok TokSectionClose
--                   ig <- ignore
--                   return (ic,ig))
--    return (IgnoreSectContents i is)
--
--ignore :: XParser Ignore
--ignore = do
--  is <- many1 (nottok [TokSectionOpen,TokSectionClose])
--  return Ignore  `debug` ("ignored all of: "++show is)
----

-- | Return either a general entity reference, or a character reference.
reference :: XParser Reference
reference = do
    bracket (tok TokAmp) (tok TokSemi) (freetext >>= val)
  where
    val ('#':'x':i) | all isHexDigit i
                    = return . RefChar . fst . head . readHex $ i
    val ('#':i)     | all isDigit i
                    = return . RefChar . fst . head . readDec $ i
    val name        = return . RefEntity $ name

{- -- following is incorrect
reference =
    ( charref >>= return . RefChar) `onFail`
    ( entityref >>= return . RefEntity)

entityref :: XParser EntityRef
entityref = do
    bracket (tok TokAmp) (commit $ tok TokSemi) name

charref :: XParser CharRef
charref = do
    bracket (tok TokAmp) (commit $ tok TokSemi) (freetext >>= readCharVal)
  where
    readCharVal ('#':'x':i) = return . fst . head . readHex $ i
    readCharVal ('#':i)     = return . fst . head . readDec $ i
    readCharVal _           = mzero
-}

pereference :: XParser PEReference
pereference = do
    bracket (tok TokPercent) (tok TokSemi) nmtoken

entitydecl :: XParser EntityDecl
entitydecl =
    ( gedecl >>= return . EntityGEDecl) `onFail`
    ( pedecl >>= return . EntityPEDecl)

gedecl :: XParser GEDecl
gedecl = do
    tok TokSpecialOpen
    tok (TokSpecial ENTITYx)
    n <- name
    e <- entitydef `adjustErrBad` (("in general entity defn "++n++",\n")++)
    tok TokAnyClose `onFail` failBadP ("expected > terminating G ENTITY decl "++n)
    stUpdate (addGE n e) `debug` ("added GE defn &"++n++";")
    return (GEDecl n e)

pedecl :: XParser PEDecl
pedecl = do
    tok TokSpecialOpen
    tok (TokSpecial ENTITYx)
    tok TokPercent
    n <- name
    e <- pedef `adjustErrBad` (("in parameter entity defn "++n++",\n")++)
    tok TokAnyClose `onFail` failBadP ("expected > terminating P ENTITY decl "++n)
    stUpdate (addPE n e) `debug` ("added PE defn %"++n++";\n"++show e)
    return (PEDecl n e)

entitydef :: XParser EntityDef
entitydef =
    oneOf' [ ("entityvalue", entityvalue >>= return . DefEntityValue)
           , ("external",    do eid <- externalid
                                ndd <- maybe ndatadecl
                                return (DefExternalID eid ndd))
           ]

pedef :: XParser PEDef
pedef =
    oneOf' [ ("entityvalue", entityvalue >>= return . PEDefEntityValue)
           , ("externalid",  externalid  >>= return . PEDefExternalID)
           ]

externalid :: XParser ExternalID
externalid =
    oneOf' [ ("SYSTEM", do word "SYSTEM"
                           s <- systemliteral
                           return (SYSTEM s) )
           , ("PUBLIC", do word "PUBLIC"
                           p <- pubidliteral
                           s <- systemliteral
                           return (PUBLIC p s) )
           ]
      `adjustErr` ("looking for an external id,\n"++)

ndatadecl :: XParser NDataDecl
ndatadecl = do
    word "NDATA"
    n <- name
    return (NDATA n)

textdecl :: XParser TextDecl
textdecl = do
    tok TokPIOpen
    (word "xml" `onFail` word "XML")
    v <- maybe versioninfo
    e <- encodingdecl
    tok TokPIClose `onFail` failP "expected ?> terminating text decl"
    return (TextDecl v e)

--extparsedent :: XParser (ExtParsedEnt Posn)
--extparsedent = do
--    t <- maybe textdecl
--    c <- content
--    return (ExtParsedEnt t c)
--
--extpe :: XParser ExtPE
--extpe = do
--    t <- maybe textdecl
--    e <- many (peRef extsubsetdecl)
--    return (ExtPE t e)

encodingdecl :: XParser EncodingDecl
encodingdecl = do
    (word "encoding" `onFail` word "ENCODING")
    tok TokEqual `onFail` failBadP "expected = in 'encoding' decl"
    f <- bracket (tok TokQuote) (commit $ tok TokQuote) freetext
    return (EncodingDecl f)

notationdecl :: XParser NotationDecl
notationdecl = do
    tok TokSpecialOpen
    tok (TokSpecial NOTATIONx)
    n <- name
    e <- either externalid publicid
    tok TokAnyClose `onFail` failBadP ("expected > terminating NOTATION decl "++n)
    return (NOTATION n e)

publicid :: XParser PublicID
publicid = do
    word "PUBLIC"
    p <- pubidliteral
    return (PUBLICID p)

entityvalue :: XParser EntityValue
entityvalue = do
 -- evs <- bracket (tok TokQuote) (commit $ tok TokQuote) (many (peRef ev))
    tok TokQuote
    pn <- posn
    evs <- many ev
    tok TokQuote `onFail` failBadP "expected quote to terminate entityvalue"
    -- quoted text must be rescanned for possible PERefs
    st <- stGet
    Prelude.either failBad (return . EntityValue) . fst3 $
                (runParser (many ev) st
                         (reLexEntityValue (\s-> stringify (lookupPE s st))
                                           pn
                                           (flattenEV (EntityValue evs))))
  where
    stringify (Just (PEDefEntityValue ev)) = Just (flattenEV ev)
    stringify _ = Nothing

ev :: XParser EV
ev =
    oneOf' [ ("string",    (string`onFail`freetext) >>= return . EVString)
           , ("reference", reference >>= return . EVRef)
           ]
      `adjustErr` ("looking for entity value,\n"++)

attvalue :: XParser AttValue
attvalue = do
    avs <- bracket (tok TokQuote) (commit $ tok TokQuote)
                   (many (either freetext reference))
    return (AttValue avs)

systemliteral :: XParser SystemLiteral
systemliteral = do
    s <- bracket (tok TokQuote) (commit $ tok TokQuote) freetext
    return (SystemLiteral s)            -- note: refs &...; not permitted

pubidliteral :: XParser PubidLiteral
pubidliteral = do
    s <- bracket (tok TokQuote) (commit $ tok TokQuote) freetext
    return (PubidLiteral s)             -- note: freetext is too liberal here

-- | Return parsed freetext (i.e. until the next markup)
chardata :: XParser CharData
chardata = freetext