1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2020, University of Amsterdam 7 VU University Amsterdam 8 CWI, Amsterdam 9 All rights reserved. 10 11 Redistribution and use in source and binary forms, with or without 12 modification, are permitted provided that the following conditions 13 are met: 14 15 1. Redistributions of source code must retain the above copyright 16 notice, this list of conditions and the following disclaimer. 17 18 2. Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in 20 the documentation and/or other materials provided with the 21 distribution. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 POSSIBILITY OF SUCH DAMAGE. 35*/ 36 37:- module(sgml, 38 [ load_html/3, % +Input, -DOM, +Options 39 load_xml/3, % +Input, -DOM, +Options 40 load_sgml/3, % +Input, -DOM, +Options 41 42 load_sgml_file/2, % +File, -ListOfContent 43 load_xml_file/2, % +File, -ListOfContent 44 load_html_file/2, % +File, -Document 45 46 load_structure/3, % +File, -Term, +Options 47 48 load_dtd/2, % +DTD, +File 49 load_dtd/3, % +DTD, +File, +Options 50 dtd/2, % +Type, -DTD 51 dtd_property/2, % +DTD, ?Property 52 53 new_dtd/2, % +Doctype, -DTD 54 free_dtd/1, % +DTD 55 open_dtd/3, % +DTD, +Options, -Stream 56 57 new_sgml_parser/2, % -Parser, +Options 58 free_sgml_parser/1, % +Parser 59 set_sgml_parser/2, % +Parser, +Options 60 get_sgml_parser/2, % +Parser, +Options 61 sgml_parse/2, % +Parser, +Options 62 63 sgml_register_catalog_file/2, % +File, +StartOrEnd 64 65 xml_quote_attribute/3, % +In, -Quoted, +Encoding 66 xml_quote_cdata/3, % +In, -Quoted, +Encoding 67 xml_quote_attribute/2, % +In, -Quoted 68 xml_quote_cdata/2, % +In, -Quoted 69 xml_name/1, % +In 70 xml_name/2, % +In, +Encoding 71 72 xsd_number_string/2, % ?Number, ?String 73 xsd_time_string/3, % ?Term, ?Type, ?String 74 75 xml_basechar/1, % +Code 76 xml_ideographic/1, % +Code 77 xml_combining_char/1, % +Code 78 xml_digit/1, % +Code 79 xml_extender/1, % +Code 80 81 iri_xml_namespace/2, % +IRI, -Namespace 82 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 83 xml_is_dom/1 % +Term 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89 90:- meta_predicate 91 load_structure( , , ), 92 load_html( , , ), 93 load_xml( , , ), 94 load_sgml( , , ). 95 96:- predicate_options(load_structure/3, 3, 97 [ charpos(integer), 98 cdata(oneof([atom,string])), 99 defaults(boolean), 100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 101 doctype(atom), 102 dtd(any), 103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 104 entity(atom,atom), 105 keep_prefix(boolean), 106 file(atom), 107 line(integer), 108 offset(integer), 109 number(oneof([token,integer])), 110 qualify_attributes(boolean), 111 shorttag(boolean), 112 case_sensitive_attributes(boolean), 113 case_preserving_attributes(boolean), 114 system_entities(boolean), 115 max_memory(integer), 116 ignore_doctype(boolean), 117 space(oneof([sgml,preserve,default,remove,strict])), 118 xmlns(atom), 119 xmlns(atom,atom), 120 pass_to(sgml_parse/2, 2) 121 ]). 122:- predicate_options(load_html/3, 3, 123 [ pass_to(load_structure/3, 3) 124 ]). 125:- predicate_options(load_xml/3, 3, 126 [ pass_to(load_structure/3, 3) 127 ]). 128:- predicate_options(load_sgml/3, 3, 129 [ pass_to(load_structure/3, 3) 130 ]). 131:- predicate_options(load_dtd/3, 3, 132 [ dialect(oneof([sgml,xml,xmlns])), 133 pass_to(open/4, 4) 134 ]). 135:- predicate_options(sgml_parse/2, 2, 136 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 137 callable), 138 cdata(oneof([atom,string])), 139 content_length(integer), 140 document(-any), 141 max_errors(integer), 142 parse(oneof([file,element,content,declaration,input])), 143 source(any), 144 syntax_errors(oneof([quiet,print,style])), 145 xml_no_ns(oneof([error,quiet])) 146 ]). 147:- predicate_options(new_sgml_parser/2, 2, 148 [ dtd(any) 149 ]).
179:- multifile user:file_search_path/2. 180:- dynamic user:file_search_path/2. 181 182user:file_search_path(dtd, '.'). 183user:file_search_path(dtd, swi('library/DTD')). 184 185sgml_register_catalog_file(File, Location) :- 186 prolog_to_os_filename(File, OsFile), 187 '_sgml_register_catalog_file'(OsFile, Location). 188 189:- use_foreign_library(foreign(sgml2pl)). 190 191register_catalog(Base) :- 192 absolute_file_name(dtd(Base), 193 [ extensions([soc]), 194 access(read), 195 file_errors(fail) 196 ], 197 SocFile), 198 sgml_register_catalog_file(SocFile, end). 199 200:- initialization 201 ignore(register_catalog('HTML4')). 202 203 204 /******************************* 205 * DTD HANDLING * 206 *******************************/ 207 208/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 209Note that concurrent access to DTD objects is not allowed, and hence we 210will allocate and destroy them in each thread. Possibibly it would be 211nicer to find out why concurrent access to DTD's is flawed. It is 212diagnosed to mess with the entity resolution by Fabien Todescato. 213- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 214 215:- thread_local 216 current_dtd/2. 217:- volatile 218 current_dtd/2. 219:- thread_local 220 registered_cleanup/0. 221:- volatile 222 registered_cleanup/0. 223 224:- multifile 225 dtd_alias/2. 226 227:- create_prolog_flag(html_dialect, html5, [type(atom)]). 228 229dtd_alias(html4, 'HTML4'). 230dtd_alias(html5, 'HTML5'). 231dtd_alias(html, DTD) :- 232 current_prolog_flag(html_dialect, Dialect), 233 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
245dtd(Type, DTD) :- 246 current_dtd(Type, DTD), 247 !. 248dtd(Type, DTD) :- 249 new_dtd(Type, DTD), 250 ( dtd_alias(Type, Base) 251 -> true 252 ; Base = Type 253 ), 254 absolute_file_name(dtd(Base), 255 [ extensions([dtd]), 256 access(read) 257 ], DtdFile), 258 load_dtd(DTD, DtdFile), 259 register_cleanup, 260 asserta(current_dtd(Type, DTD)).
275load_dtd(DTD, DtdFile) :- 276 load_dtd(DTD, DtdFile, []). 277load_dtd(DTD, DtdFile, Options) :- 278 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 279 setup_call_cleanup( 280 open_dtd(DTD, DTDOptions, DtdOut), 281 setup_call_cleanup( 282 open(DtdFile, read, DtdIn, OpenOptions), 283 copy_stream_data(DtdIn, DtdOut), 284 close(DtdIn)), 285 close(DtdOut)).
292destroy_dtds :-
293 ( current_dtd(_Type, DTD),
294 free_dtd(DTD),
295 fail
296 ; true
297 ).
303register_cleanup :- 304 registered_cleanup, 305 !. 306register_cleanup :- 307 ( thread_self(main) 308 -> at_halt(destroy_dtds) 309 ; current_prolog_flag(threads, true) 310 -> prolog_listen(this_thread_exit, destroy_dtds) 311 ; true 312 ), 313 assert(registered_cleanup). 314 315 316 /******************************* 317 * EXAMINE DTD * 318 *******************************/ 319 320prop(doctype(_), _). 321prop(elements(_), _). 322prop(entities(_), _). 323prop(notations(_), _). 324prop(entity(E, _), DTD) :- 325 ( nonvar(E) 326 -> true 327 ; '$dtd_property'(DTD, entities(EL)), 328 member(E, EL) 329 ). 330prop(element(E, _, _), DTD) :- 331 ( nonvar(E) 332 -> true 333 ; '$dtd_property'(DTD, elements(EL)), 334 member(E, EL) 335 ). 336prop(attributes(E, _), DTD) :- 337 ( nonvar(E) 338 -> true 339 ; '$dtd_property'(DTD, elements(EL)), 340 member(E, EL) 341 ). 342prop(attribute(E, A, _, _), DTD) :- 343 ( nonvar(E) 344 -> true 345 ; '$dtd_property'(DTD, elements(EL)), 346 member(E, EL) 347 ), 348 ( nonvar(A) 349 -> true 350 ; '$dtd_property'(DTD, attributes(E, AL)), 351 member(A, AL) 352 ). 353prop(notation(N, _), DTD) :- 354 ( nonvar(N) 355 -> true 356 ; '$dtd_property'(DTD, notations(NL)), 357 member(N, NL) 358 ). 359 360dtd_property(DTD, Prop) :- 361 prop(Prop, DTD), 362 '$dtd_property'(DTD, Prop). 363 364 365 /******************************* 366 * SGML * 367 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.391load_structure(Spec, DOM, Options) :- 392 sgml_open_options(Options, OpenOptions, SGMLOptions), 393 setup_call_cleanup( 394 open_any(Spec, read, In, Close, OpenOptions), 395 load_structure_from_stream(In, DOM, SGMLOptions), 396 close_any(Close)). 397 398sgml_open_options(Options, OpenOptions, SGMLOptions) :- 399 Options = M:Plain, 400 ( select_option(encoding(Encoding), Plain, NoEnc) 401 -> ( sgml_encoding(Encoding) 402 -> merge_options(NoEnc, [type(binary)], OpenOptions), 403 SGMLOptions = Options 404 ; OpenOptions = Plain, 405 SGMLOptions = M:NoEnc 406 ) 407 ; merge_options(Plain, [type(binary)], OpenOptions), 408 SGMLOptions = Options 409 ). 410 411sgml_encoding(Enc) :- 412 downcase_atom(Enc, Enc1), 413 sgml_encoding_l(Enc1). 414 415sgml_encoding_l('iso-8859-1'). 416sgml_encoding_l('us-ascii'). 417sgml_encoding_l('utf-8'). 418sgml_encoding_l('utf8'). 419sgml_encoding_l('iso_latin_1'). 420sgml_encoding_l('ascii'). 421 422load_structure_from_stream(In, Term, M:Options) :- 423 ( select_option(dtd(DTD), Options, Options1) 424 -> ExplicitDTD = true 425 ; ExplicitDTD = false, 426 Options1 = Options 427 ), 428 move_front(Options1, dialect(_), Options2), % dialect sets defaults 429 setup_call_cleanup( 430 new_sgml_parser(Parser, 431 [ dtd(DTD) 432 ]), 433 parse(Parser, M:Options2, TermRead, In), 434 free_sgml_parser(Parser)), 435 ( ExplicitDTD == true 436 -> ( DTD = dtd(_, DocType), 437 dtd_property(DTD, doctype(DocType)) 438 -> true 439 ; true 440 ) 441 ; free_dtd(DTD) 442 ), 443 Term = TermRead. 444 445move_front(Options0, Opt, Options) :- 446 selectchk(Opt, Options0, Options1), 447 !, 448 Options = [Opt|Options1]. 449move_front(Options, _, Options). 450 451 452parse(Parser, M:Options, Document, In) :- 453 set_parser_options(Options, Parser, In, Options1), 454 parser_meta_options(Options1, M, Options2), 455 set_input_location(Parser, In), 456 sgml_parse(Parser, 457 [ document(Document), 458 source(In) 459 | Options2 460 ]). 461 462set_parser_options([], _, _, []). 463set_parser_options([H|T], Parser, In, Rest) :- 464 ( set_parser_option(H, Parser, In) 465 -> set_parser_options(T, Parser, In, Rest) 466 ; Rest = [H|R2], 467 set_parser_options(T, Parser, In, R2) 468 ). 469 470set_parser_option(Var, _Parser, _In) :- 471 var(Var), 472 !, 473 instantiation_error(Var). 474set_parser_option(Option, Parser, _) :- 475 def_entity(Option, Parser), 476 !. 477set_parser_option(offset(Offset), _Parser, In) :- 478 !, 479 seek(In, Offset, bof, _). 480set_parser_option(Option, Parser, _In) :- 481 parser_option(Option), 482 !, 483 set_sgml_parser(Parser, Option). 484set_parser_option(Name=Value, Parser, In) :- 485 Option =.. [Name,Value], 486 set_parser_option(Option, Parser, In). 487 488 489parser_option(dialect(_)). 490parser_option(shorttag(_)). 491parser_option(case_sensitive_attributes(_)). 492parser_option(case_preserving_attributes(_)). 493parser_option(system_entities(_)). 494parser_option(max_memory(_)). 495parser_option(ignore_doctype(_)). 496parser_option(file(_)). 497parser_option(line(_)). 498parser_option(space(_)). 499parser_option(number(_)). 500parser_option(defaults(_)). 501parser_option(doctype(_)). 502parser_option(qualify_attributes(_)). 503parser_option(encoding(_)). 504parser_option(keep_prefix(_)). 505 506 507def_entity(entity(Name, Value), Parser) :- 508 get_sgml_parser(Parser, dtd(DTD)), 509 xml_quote_attribute(Value, QValue), 510 setup_call_cleanup(open_dtd(DTD, [], Stream), 511 format(Stream, '<!ENTITY ~w "~w">~n', 512 [Name, QValue]), 513 close(Stream)). 514def_entity(xmlns(URI), Parser) :- 515 set_sgml_parser(Parser, xmlns(URI)). 516def_entity(xmlns(NS, URI), Parser) :- 517 set_sgml_parser(Parser, xmlns(NS, URI)).
523parser_meta_options([], _, []). 524parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 525 !, 526 parser_meta_options(T0, M, T). 527parser_meta_options([H|T0], M, [H|T]) :- 528 parser_meta_options(T0, M, T).
535set_input_location(Parser, _In) :- 536 get_sgml_parser(Parser, file(_)), 537 !. 538set_input_location(Parser, In) :- 539 stream_property(In, file_name(File)), 540 !, 541 set_sgml_parser(Parser, file(File)), 542 stream_property(In, position(Pos)), 543 set_sgml_parser(Parser, position(Pos)). 544set_input_location(_, _). 545 546 /******************************* 547 * UTILITIES * 548 *******************************/
557load_sgml_file(File, Term) :-
558 load_sgml(File, Term, []).
567load_xml_file(File, Term) :-
568 load_xml(File, Term, []).
577load_html_file(File, DOM) :-
578 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open) to support loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
607load_html(File, Term, M:Options) :-
608 current_prolog_flag(html_dialect, Dialect),
609 dtd(Dialect, DTD),
610 merge_options(Options,
611 [ dtd(DTD),
612 dialect(Dialect),
613 max_errors(-1),
614 syntax_errors(quiet)
615 ], Options1),
616 load_structure(File, Term, M:Options1).
dialect(xml)
626load_xml(Input, DOM, M:Options) :-
627 merge_options(Options,
628 [ dialect(xml)
629 ], Options1),
630 load_structure(Input, DOM, M:Options1).
dialect(sgml)
640load_sgml(Input, DOM, M:Options) :- 641 merge_options(Options, 642 [ dialect(sgml) 643 ], Options1), 644 load_structure(Input, DOM, M:Options1). 645 646 647 648 /******************************* 649 * ENCODING * 650 *******************************/
660xml_quote_attribute(In, Quoted) :- 661 xml_quote_attribute(In, Quoted, ascii). 662 663xml_quote_cdata(In, Quoted) :- 664 xml_quote_cdata(In, Quoted, ascii).
670xml_name(In) :- 671 xml_name(In, ascii). 672 673 674 /******************************* 675 * XML CHARACTER CLASSES * 676 *******************************/
691 /******************************* 692 * TYPE CHECKING * 693 *******************************/
700xml_is_dom(0) :- !, fail. % catch variables 701xml_is_dom(List) :- 702 is_list(List), 703 !, 704 xml_is_content_list(List). 705xml_is_dom(Term) :- 706 xml_is_element(Term). 707 708xml_is_content_list([]). 709xml_is_content_list([H|T]) :- 710 xml_is_content(H), 711 xml_is_content_list(T). 712 713xml_is_content(0) :- !, fail. 714xml_is_content(pi(Pi)) :- 715 !, 716 atom(Pi). 717xml_is_content(CDATA) :- 718 atom(CDATA), 719 !. 720xml_is_content(CDATA) :- 721 string(CDATA), 722 !. 723xml_is_content(Term) :- 724 xml_is_element(Term). 725 726xml_is_element(element(Name, Attributes, Content)) :- 727 dom_name(Name), 728 dom_attributes(Attributes), 729 xml_is_content_list(Content). 730 731dom_name(NS:Local) :- 732 atom(NS), 733 atom(Local), 734 !. 735dom_name(Local) :- 736 atom(Local). 737 738dom_attributes(0) :- !, fail. 739dom_attributes([]). 740dom_attributes([H|T]) :- 741 dom_attribute(H), 742 dom_attributes(T). 743 744dom_attribute(Name=Value) :- 745 dom_name(Name), 746 atomic(Value). 747 748 749 /******************************* 750 * MESSAGES * 751 *******************************/ 752:- multifile 753 prolog:message/3. 754 755% Catch messages. sgml/4 is generated by the SGML2PL binding. 756 757prologmessage(sgml(Parser, File, Line, Message)) --> 758 { get_sgml_parser(Parser, dialect(Dialect)) 759 }, 760 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 761 762 763 /******************************* 764 * XREF SUPPORT * 765 *******************************/ 766 767:- multifile 768 prolog:called_by/2. 769 770prologcalled_by(sgml_parse(_, Options), Called) :- 771 findall(Meta, meta_call_term(_, Meta, Options), Called). 772 773meta_call_term(T, G+N, Options) :- 774 T = call(Event, G), 775 pmember(T, Options), 776 call_params(Event, Term), 777 functor(Term, _, N). 778 779pmember(X, List) :- % member for partial lists 780 nonvar(List), 781 List = [H|T], 782 ( X = H 783 ; pmember(X, T) 784 ). 785 786call_params(begin, begin(tag,attributes,parser)). 787call_params(end, end(tag,parser)). 788call_params(cdata, cdata(cdata,parser)). 789call_params(pi, pi(cdata,parser)). 790call_params(decl, decl(cdata,parser)). 791call_params(error, error(severity,message,parser)). 792call_params(xmlns, xmlns(namespace,url,parser)). 793call_params(urlns, urlns(url,url,parser)). 794 795 /******************************* 796 * SANDBOX * 797 *******************************/ 798 799:- multifile 800 sandbox:safe_primitive/1, 801 sandbox:safe_meta_predicate/1. 802 803sandbox:safe_meta_predicate(sgml:load_structure/3). 804sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 805 dtd_alias(Dialect, _). 806sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 807sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 808sandbox:safe_primitive(sgml:xml_name(_,_)). 809sandbox:safe_primitive(sgml:xml_basechar(_)). 810sandbox:safe_primitive(sgml:xml_ideographic(_)). 811sandbox:safe_primitive(sgml:xml_combining_char(_)). 812sandbox:safe_primitive(sgml:xml_digit(_)). 813sandbox:safe_primitive(sgml:xml_extender(_)). 814sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 815sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 816sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by library(xpath) to extract information from the document.