1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2020, University of Amsterdam 7 VU University Amsterdam 8 CWI, Amsterdam 9 All rights reserved. 10 11 Redistribution and use in source and binary forms, with or without 12 modification, are permitted provided that the following conditions 13 are met: 14 15 1. Redistributions of source code must retain the above copyright 16 notice, this list of conditions and the following disclaimer. 17 18 2. Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in 20 the documentation and/or other materials provided with the 21 distribution. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 POSSIBILITY OF SUCH DAMAGE. 35*/ 36 37:- module(sgml, 38 [ load_html/3, % +Input, -DOM, +Options 39 load_xml/3, % +Input, -DOM, +Options 40 load_sgml/3, % +Input, -DOM, +Options 41 42 load_sgml_file/2, % +File, -ListOfContent 43 load_xml_file/2, % +File, -ListOfContent 44 load_html_file/2, % +File, -Document 45 46 load_structure/3, % +File, -Term, +Options 47 48 load_dtd/2, % +DTD, +File 49 load_dtd/3, % +DTD, +File, +Options 50 dtd/2, % +Type, -DTD 51 dtd_property/2, % +DTD, ?Property 52 53 new_dtd/2, % +Doctype, -DTD 54 free_dtd/1, % +DTD 55 open_dtd/3, % +DTD, +Options, -Stream 56 57 new_sgml_parser/2, % -Parser, +Options 58 free_sgml_parser/1, % +Parser 59 set_sgml_parser/2, % +Parser, +Options 60 get_sgml_parser/2, % +Parser, +Options 61 sgml_parse/2, % +Parser, +Options 62 63 sgml_register_catalog_file/2, % +File, +StartOrEnd 64 65 xml_quote_attribute/3, % +In, -Quoted, +Encoding 66 xml_quote_cdata/3, % +In, -Quoted, +Encoding 67 xml_quote_attribute/2, % +In, -Quoted 68 xml_quote_cdata/2, % +In, -Quoted 69 xml_name/1, % +In 70 xml_name/2, % +In, +Encoding 71 72 xsd_number_string/2, % ?Number, ?String 73 xsd_time_string/3, % ?Term, ?Type, ?String 74 75 xml_basechar/1, % +Code 76 xml_ideographic/1, % +Code 77 xml_combining_char/1, % +Code 78 xml_digit/1, % +Code 79 xml_extender/1, % +Code 80 81 iri_xml_namespace/2, % +IRI, -Namespace 82 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 83 xml_is_dom/1 % +Term 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89 90:- meta_predicate 91 load_structure( , , ), 92 load_html( , , ), 93 load_xml( , , ), 94 load_sgml( , , ). 95 96:- predicate_options(load_structure/3, 3, 97 [ charpos(integer), 98 cdata(oneof([atom,string])), 99 defaults(boolean), 100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 101 doctype(atom), 102 dtd(any), 103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 104 entity(atom,atom), 105 keep_prefix(boolean), 106 file(atom), 107 line(integer), 108 offset(integer), 109 number(oneof([token,integer])), 110 qualify_attributes(boolean), 111 shorttag(boolean), 112 case_sensitive_attributes(boolean), 113 case_preserving_attributes(boolean), 114 system_entities(boolean), 115 max_memory(integer), 116 space(oneof([sgml,preserve,default,remove,strict])), 117 xmlns(atom), 118 xmlns(atom,atom), 119 pass_to(sgml_parse/2, 2) 120 ]). 121:- predicate_options(load_html/3, 3, 122 [ pass_to(load_structure/3, 3) 123 ]). 124:- predicate_options(load_xml/3, 3, 125 [ pass_to(load_structure/3, 3) 126 ]). 127:- predicate_options(load_sgml/3, 3, 128 [ pass_to(load_structure/3, 3) 129 ]). 130:- predicate_options(load_dtd/3, 3, 131 [ dialect(oneof([sgml,xml,xmlns])), 132 pass_to(open/4, 4) 133 ]). 134:- predicate_options(sgml_parse/2, 2, 135 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 136 callable), 137 cdata(oneof([atom,string])), 138 content_length(integer), 139 document(-any), 140 max_errors(integer), 141 parse(oneof([file,element,content,declaration,input])), 142 source(any), 143 syntax_errors(oneof([quiet,print,style])), 144 xml_no_ns(oneof([error,quiet])) 145 ]). 146:- predicate_options(new_sgml_parser/2, 2, 147 [ dtd(any) 148 ]).
178:- multifile user:file_search_path/2. 179:- dynamic user:file_search_path/2. 180 181user:file_search_path(dtd, '.'). 182user:file_search_path(dtd, swi('library/DTD')). 183 184sgml_register_catalog_file(File, Location) :- 185 prolog_to_os_filename(File, OsFile), 186 '_sgml_register_catalog_file'(OsFile, Location). 187 188:- use_foreign_library(foreign(sgml2pl)). 189 190register_catalog(Base) :- 191 absolute_file_name(dtd(Base), 192 [ extensions([soc]), 193 access(read), 194 file_errors(fail) 195 ], 196 SocFile), 197 sgml_register_catalog_file(SocFile, end). 198 199:- initialization 200 ignore(register_catalog('HTML4')). 201 202 203 /******************************* 204 * DTD HANDLING * 205 *******************************/ 206 207/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 208Note that concurrent access to DTD objects is not allowed, and hence we 209will allocate and destroy them in each thread. Possibibly it would be 210nicer to find out why concurrent access to DTD's is flawed. It is 211diagnosed to mess with the entity resolution by Fabien Todescato. 212- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 213 214:- thread_local 215 current_dtd/2. 216:- volatile 217 current_dtd/2. 218:- thread_local 219 registered_cleanup/0. 220:- volatile 221 registered_cleanup/0. 222 223:- multifile 224 dtd_alias/2. 225 226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227 228dtd_alias(html4, 'HTML4'). 229dtd_alias(html5, 'HTML5'). 230dtd_alias(html, DTD) :- 231 current_prolog_flag(html_dialect, Dialect), 232 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
244dtd(Type, DTD) :- 245 current_dtd(Type, DTD), 246 !. 247dtd(Type, DTD) :- 248 new_dtd(Type, DTD), 249 ( dtd_alias(Type, Base) 250 -> true 251 ; Base = Type 252 ), 253 absolute_file_name(dtd(Base), 254 [ extensions([dtd]), 255 access(read) 256 ], DtdFile), 257 load_dtd(DTD, DtdFile), 258 register_cleanup, 259 asserta(current_dtd(Type, DTD)).
274load_dtd(DTD, DtdFile) :- 275 load_dtd(DTD, DtdFile, []). 276load_dtd(DTD, DtdFile, Options) :- 277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 278 setup_call_cleanup( 279 open_dtd(DTD, DTDOptions, DtdOut), 280 setup_call_cleanup( 281 open(DtdFile, read, DtdIn, OpenOptions), 282 copy_stream_data(DtdIn, DtdOut), 283 close(DtdIn)), 284 close(DtdOut)).
291:- public 292 destroy_dtds/0. % called through register_cleanup/0 293 294destroy_dtds :- 295 ( current_dtd(_Type, DTD), 296 free_dtd(DTD), 297 fail 298 ; true 299 ).
305register_cleanup :- 306 registered_cleanup, 307 !. 308register_cleanup :- 309 ( thread_self(main) 310 -> at_halt(destroy_dtds) 311 ; current_prolog_flag(threads, true) 312 -> prolog_listen(this_thread_exit, destroy_dtds) 313 ; true 314 ), 315 assert(registered_cleanup). 316 317 318 /******************************* 319 * EXAMINE DTD * 320 *******************************/ 321 322prop(doctype(_), _). 323prop(elements(_), _). 324prop(entities(_), _). 325prop(notations(_), _). 326prop(entity(E, _), DTD) :- 327 ( nonvar(E) 328 -> true 329 ; '$dtd_property'(DTD, entities(EL)), 330 member(E, EL) 331 ). 332prop(element(E, _, _), DTD) :- 333 ( nonvar(E) 334 -> true 335 ; '$dtd_property'(DTD, elements(EL)), 336 member(E, EL) 337 ). 338prop(attributes(E, _), DTD) :- 339 ( nonvar(E) 340 -> true 341 ; '$dtd_property'(DTD, elements(EL)), 342 member(E, EL) 343 ). 344prop(attribute(E, A, _, _), DTD) :- 345 ( nonvar(E) 346 -> true 347 ; '$dtd_property'(DTD, elements(EL)), 348 member(E, EL) 349 ), 350 ( nonvar(A) 351 -> true 352 ; '$dtd_property'(DTD, attributes(E, AL)), 353 member(A, AL) 354 ). 355prop(notation(N, _), DTD) :- 356 ( nonvar(N) 357 -> true 358 ; '$dtd_property'(DTD, notations(NL)), 359 member(N, NL) 360 ). 361 362dtd_property(DTD, Prop) :- 363 prop(Prop, DTD), 364 '$dtd_property'(DTD, Prop). 365 366 367 /******************************* 368 * SGML * 369 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.393load_structure(Spec, DOM, Options) :- 394 sgml_open_options(Options, OpenOptions, SGMLOptions), 395 setup_call_cleanup( 396 open_any(Spec, read, In, Close, OpenOptions), 397 load_structure_from_stream(In, DOM, SGMLOptions), 398 close_any(Close)). 399 400sgml_open_options(Options, OpenOptions, SGMLOptions) :- 401 Options = M:Plain, 402 ( select_option(encoding(Encoding), Plain, NoEnc) 403 -> ( sgml_encoding(Encoding) 404 -> merge_options(NoEnc, [type(binary)], OpenOptions), 405 SGMLOptions = Options 406 ; OpenOptions = Plain, 407 SGMLOptions = M:NoEnc 408 ) 409 ; merge_options(Plain, [type(binary)], OpenOptions), 410 SGMLOptions = Options 411 ). 412 413sgml_encoding(Enc) :- 414 downcase_atom(Enc, Enc1), 415 sgml_encoding_l(Enc1). 416 417sgml_encoding_l('iso-8859-1'). 418sgml_encoding_l('us-ascii'). 419sgml_encoding_l('utf-8'). 420sgml_encoding_l('utf8'). 421sgml_encoding_l('iso_latin_1'). 422sgml_encoding_l('ascii'). 423 424load_structure_from_stream(In, Term, M:Options) :- 425 ( select_option(dtd(DTD), Options, Options1) 426 -> ExplicitDTD = true 427 ; ExplicitDTD = false, 428 Options1 = Options 429 ), 430 move_front(Options1, dialect(_), Options2), % dialect sets defaults 431 setup_call_cleanup( 432 new_sgml_parser(Parser, 433 [ dtd(DTD) 434 ]), 435 parse(Parser, M:Options2, TermRead, In), 436 free_sgml_parser(Parser)), 437 ( ExplicitDTD == true 438 -> ( DTD = dtd(_, DocType), 439 dtd_property(DTD, doctype(DocType)) 440 -> true 441 ; true 442 ) 443 ; free_dtd(DTD) 444 ), 445 Term = TermRead. 446 447move_front(Options0, Opt, Options) :- 448 selectchk(Opt, Options0, Options1), 449 !, 450 Options = [Opt|Options1]. 451move_front(Options, _, Options). 452 453 454parse(Parser, M:Options, Document, In) :- 455 set_parser_options(Options, Parser, In, Options1), 456 parser_meta_options(Options1, M, Options2), 457 set_input_location(Parser, In), 458 sgml_parse(Parser, 459 [ document(Document), 460 source(In) 461 | Options2 462 ]). 463 464set_parser_options([], _, _, []). 465set_parser_options([H|T], Parser, In, Rest) :- 466 ( set_parser_option(H, Parser, In) 467 -> set_parser_options(T, Parser, In, Rest) 468 ; Rest = [H|R2], 469 set_parser_options(T, Parser, In, R2) 470 ). 471 472set_parser_option(Var, _Parser, _In) :- 473 var(Var), 474 !, 475 instantiation_error(Var). 476set_parser_option(Option, Parser, _) :- 477 def_entity(Option, Parser), 478 !. 479set_parser_option(offset(Offset), _Parser, In) :- 480 !, 481 seek(In, Offset, bof, _). 482set_parser_option(Option, Parser, _In) :- 483 parser_option(Option), 484 !, 485 set_sgml_parser(Parser, Option). 486set_parser_option(Name=Value, Parser, In) :- 487 Option =.. [Name,Value], 488 set_parser_option(Option, Parser, In). 489 490 491parser_option(dialect(_)). 492parser_option(shorttag(_)). 493parser_option(case_sensitive_attributes(_)). 494parser_option(case_preserving_attributes(_)). 495parser_option(system_entities(_)). 496parser_option(max_memory(_)). 497parser_option(file(_)). 498parser_option(line(_)). 499parser_option(space(_)). 500parser_option(number(_)). 501parser_option(defaults(_)). 502parser_option(doctype(_)). 503parser_option(qualify_attributes(_)). 504parser_option(encoding(_)). 505parser_option(keep_prefix(_)). 506 507 508def_entity(entity(Name, Value), Parser) :- 509 get_sgml_parser(Parser, dtd(DTD)), 510 xml_quote_attribute(Value, QValue), 511 setup_call_cleanup(open_dtd(DTD, [], Stream), 512 format(Stream, '<!ENTITY ~w "~w">~n', 513 [Name, QValue]), 514 close(Stream)). 515def_entity(xmlns(URI), Parser) :- 516 set_sgml_parser(Parser, xmlns(URI)). 517def_entity(xmlns(NS, URI), Parser) :- 518 set_sgml_parser(Parser, xmlns(NS, URI)).
524parser_meta_options([], _, []). 525parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 526 !, 527 parser_meta_options(T0, M, T). 528parser_meta_options([H|T0], M, [H|T]) :- 529 parser_meta_options(T0, M, T).
536set_input_location(Parser, _In) :- 537 get_sgml_parser(Parser, file(_)), 538 !. 539set_input_location(Parser, In) :- 540 stream_property(In, file_name(File)), 541 !, 542 set_sgml_parser(Parser, file(File)), 543 stream_property(In, position(Pos)), 544 set_sgml_parser(Parser, position(Pos)). 545set_input_location(_, _). 546 547 /******************************* 548 * UTILITIES * 549 *******************************/
558load_sgml_file(File, Term) :-
559 load_sgml(File, Term, []).
568load_xml_file(File, Term) :-
569 load_xml(File, Term, []).
578load_html_file(File, DOM) :-
579 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open) to support loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
608load_html(File, Term, M:Options) :-
609 current_prolog_flag(html_dialect, Dialect),
610 dtd(Dialect, DTD),
611 merge_options(Options,
612 [ dtd(DTD),
613 dialect(Dialect),
614 max_errors(-1),
615 syntax_errors(quiet)
616 ], Options1),
617 load_structure(File, Term, M:Options1).
dialect(xml)
627load_xml(Input, DOM, M:Options) :-
628 merge_options(Options,
629 [ dialect(xml)
630 ], Options1),
631 load_structure(Input, DOM, M:Options1).
dialect(sgml)
641load_sgml(Input, DOM, M:Options) :- 642 merge_options(Options, 643 [ dialect(sgml) 644 ], Options1), 645 load_structure(Input, DOM, M:Options1). 646 647 648 649 /******************************* 650 * ENCODING * 651 *******************************/
661xml_quote_attribute(In, Quoted) :- 662 xml_quote_attribute(In, Quoted, ascii). 663 664xml_quote_cdata(In, Quoted) :- 665 xml_quote_cdata(In, Quoted, ascii).
671xml_name(In) :- 672 xml_name(In, ascii). 673 674 675 /******************************* 676 * XML CHARACTER CLASSES * 677 *******************************/
692 /******************************* 693 * TYPE CHECKING * 694 *******************************/
701xml_is_dom(0) :- !, fail. % catch variables 702xml_is_dom(List) :- 703 is_list(List), 704 !, 705 xml_is_content_list(List). 706xml_is_dom(Term) :- 707 xml_is_element(Term). 708 709xml_is_content_list([]). 710xml_is_content_list([H|T]) :- 711 xml_is_content(H), 712 xml_is_content_list(T). 713 714xml_is_content(0) :- !, fail. 715xml_is_content(pi(Pi)) :- 716 !, 717 atom(Pi). 718xml_is_content(CDATA) :- 719 atom(CDATA), 720 !. 721xml_is_content(CDATA) :- 722 string(CDATA), 723 !. 724xml_is_content(Term) :- 725 xml_is_element(Term). 726 727xml_is_element(element(Name, Attributes, Content)) :- 728 dom_name(Name), 729 dom_attributes(Attributes), 730 xml_is_content_list(Content). 731 732dom_name(NS:Local) :- 733 atom(NS), 734 atom(Local), 735 !. 736dom_name(Local) :- 737 atom(Local). 738 739dom_attributes(0) :- !, fail. 740dom_attributes([]). 741dom_attributes([H|T]) :- 742 dom_attribute(H), 743 dom_attributes(T). 744 745dom_attribute(Name=Value) :- 746 dom_name(Name), 747 atomic(Value). 748 749 750 /******************************* 751 * MESSAGES * 752 *******************************/ 753:- multifile 754 prolog:message/3. 755 756% Catch messages. sgml/4 is generated by the SGML2PL binding. 757 758prologmessage(sgml(Parser, File, Line, Message)) --> 759 { get_sgml_parser(Parser, dialect(Dialect)) 760 }, 761 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 762 763 764 /******************************* 765 * XREF SUPPORT * 766 *******************************/ 767 768:- multifile 769 prolog:called_by/2. 770 771prologcalled_by(sgml_parse(_, Options), Called) :- 772 findall(Meta, meta_call_term(_, Meta, Options), Called). 773 774meta_call_term(T, G+N, Options) :- 775 T = call(Event, G), 776 pmember(T, Options), 777 call_params(Event, Term), 778 functor(Term, _, N). 779 780pmember(X, List) :- % member for partial lists 781 nonvar(List), 782 List = [H|T], 783 ( X = H 784 ; pmember(X, T) 785 ). 786 787call_params(begin, begin(tag,attributes,parser)). 788call_params(end, end(tag,parser)). 789call_params(cdata, cdata(cdata,parser)). 790call_params(pi, pi(cdata,parser)). 791call_params(decl, decl(cdata,parser)). 792call_params(error, error(severity,message,parser)). 793call_params(xmlns, xmlns(namespace,url,parser)). 794call_params(urlns, urlns(url,url,parser)). 795 796 /******************************* 797 * SANDBOX * 798 *******************************/ 799 800:- multifile 801 sandbox:safe_primitive/1, 802 sandbox:safe_meta_predicate/1. 803 804sandbox:safe_meta_predicate(sgml:load_structure/3). 805sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 806 dtd_alias(Dialect, _). 807sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 808sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 809sandbox:safe_primitive(sgml:xml_name(_,_)). 810sandbox:safe_primitive(sgml:xml_basechar(_)). 811sandbox:safe_primitive(sgml:xml_ideographic(_)). 812sandbox:safe_primitive(sgml:xml_combining_char(_)). 813sandbox:safe_primitive(sgml:xml_digit(_)). 814sandbox:safe_primitive(sgml:xml_extender(_)). 815sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 816sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 817sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by library(xpath) to extract information from the document.