View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2023, VU University Amsterdam
    7			      SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(uri,
   37          [ uri_components/2,           % ?URI, ?Components
   38            uri_data/3,                 % ?Field, +Components, ?Data
   39            uri_data/4,                 % +Field, +Components, -Data, -New
   40	    uri_edit/3,			% +Actions,+URI0,-URI
   41
   42            uri_normalized/2,           % +URI, -NormalizedURI
   43            iri_normalized/2,           % +IRI, -NormalizedIRI
   44            uri_normalized_iri/2,       % +URI, -NormalizedIRI
   45            uri_normalized/3,           % +URI, +Base, -NormalizedURI
   46            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
   47            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
   48            uri_resolve/3,              % +URI, +Base, -AbsURI
   49            uri_is_global/1,            % +URI
   50            uri_query_components/2,     % ?QueryString, ?NameValueList
   51            uri_authority_components/2, % ?Authority, ?Components
   52            uri_authority_data/3,       % ?Field, ?Components, ?Data
   53					% Encoding
   54            uri_encoded/3,              % +Component, ?Value, ?Encoded
   55            uri_file_name/2,            % ?URI, ?Path
   56            uri_iri/2                   % ?URI, ?IRI
   57	  ]).   58:- autoload(library(error), [domain_error/2]).   59:- if(exists_source(library(socket))).   60:- autoload(library(socket), [gethostname/1]).   61:- endif.   62
   63:- use_foreign_library(foreign(uri)).   64
   65/** <module> Process URIs
   66
   67This  library  provides   high-performance    C-based   primitives   for
   68manipulating URIs. We decided for a  C-based implementation for the much
   69better performance on raw character  manipulation. Notably, URI handling
   70primitives are used in  time-critical  parts   of  RDF  processing. This
   71implementation is based on RFC-3986:
   72
   73        http://labs.apache.org/webarch/uri/rfc/rfc3986.html
   74
   75The URI processing in this library is  rather liberal. That is, we break
   76URIs according to the rules, but we  do not validate that the components
   77are valid. Also, percent-decoding for IRIs   is  liberal. It first tries
   78UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.
   79
   80Earlier experience has shown that strict   enforcement of the URI syntax
   81results in many errors that  are   accepted  by  many other web-document
   82processing tools.
   83*/
   84
   85%!  uri_components(+URI, -Components) is det.
   86%!  uri_components(-URI, +Components) is det.
   87%
   88%   Break a URI  into  its  5   basic  components  according  to the
   89%   RFC-3986 regular expression:
   90%
   91%       ==
   92%       ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
   93%        12            3  4          5       6  7        8 9
   94%       ==
   95%
   96%   @param Components is a   term  uri_components(Scheme, Authority,
   97%   Path, Search, Fragment). If a URI  is *parsed*, i.e., using mode
   98%   (+,-), components that are not   found are left _uninstantiated_
   99%   (variable). See uri_data/3 for accessing this structure.
  100
  101%!  uri_data(?Field, +Components, ?Data) is semidet.
  102%
  103%   Provide access the uri_component structure.  Defined field-names
  104%   are: =scheme=, =authority=, =path=, =search= and =fragment=
  105
  106uri_data(scheme,    uri_components(S, _, _, _, _), S).
  107uri_data(authority, uri_components(_, A, _, _, _), A).
  108uri_data(path,      uri_components(_, _, P, _, _), P).
  109uri_data(search,    uri_components(_, _, _, S, _), S).
  110uri_data(fragment,  uri_components(_, _, _, _, F), F).
  111
  112%!  uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
  113%
  114%   NewComponents is the same as Components with Field set to Data.
  115
  116uri_data(scheme,    uri_components(_, A, P, Q, F), S,
  117                    uri_components(S, A, P, Q, F)).
  118uri_data(authority, uri_components(S, _, P, Q, F), A,
  119                    uri_components(S, A, P, Q, F)).
  120uri_data(path,      uri_components(S, A, _, Q, F), P,
  121                    uri_components(S, A, P, Q, F)).
  122uri_data(search,    uri_components(S, A, P, _, F), Q,
  123                    uri_components(S, A, P, Q, F)).
  124uri_data(fragment,  uri_components(S, A, P, Q, _), F,
  125                    uri_components(S, A, P, Q, F)).
  126
  127%!  uri_normalized(+URI, -NormalizedURI) is det.
  128%
  129%   NormalizedURI is the normalized form   of  URI. Normalization is
  130%   syntactic and involves the following steps:
  131%
  132%       * 6.2.2.1. Case Normalization
  133%       * 6.2.2.2. Percent-Encoding Normalization
  134%       * 6.2.2.3. Path Segment Normalization
  135
  136%!  iri_normalized(+IRI, -NormalizedIRI) is det.
  137%
  138%   NormalizedIRI is the normalized form   of  IRI. Normalization is
  139%   syntactic and involves the following steps:
  140%
  141%       * 6.2.2.1. Case Normalization
  142%       * 6.2.2.3. Path Segment Normalization
  143%
  144%   @see    This is similar to uri_normalized/2, but does not do
  145%           normalization of %-escapes.
  146
  147%!  uri_normalized_iri(+URI, -NormalizedIRI) is det.
  148%
  149%   As uri_normalized/2, but percent-encoding is translated into IRI
  150%   Unicode characters. The translation  is   liberal:  valid  UTF-8
  151%   sequences  of  %-encoded  bytes  are    mapped  to  the  Unicode
  152%   character. Other %XX-sequences are mapped   to the corresponding
  153%   ISO-Latin-1 character and sole % characters are left untouched.
  154%
  155%   @see uri_iri/2.
  156
  157
  158%!  uri_is_global(+URI) is semidet.
  159%
  160%   True if URI has a scheme. The  semantics   is  the  same as the code
  161%   below, but the implementation is more efficient  as it does not need
  162%   to parse the other components, nor  needs   to  bind the scheme. The
  163%   condition to demand a scheme of more  than one character is added to
  164%   avoid confusion with DOS path names.
  165%
  166%   ==
  167%   uri_is_global(URI) :-
  168%           uri_components(URI, Components),
  169%           uri_data(scheme, Components, Scheme),
  170%           nonvar(Scheme),
  171%           atom_length(Scheme, Len),
  172%           Len > 1.
  173%   ==
  174
  175%!  uri_resolve(+URI, +Base, -GlobalURI) is det.
  176%
  177%   Resolve a possibly local URI relative   to Base. This implements
  178%   http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
  179
  180%!  uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
  181%
  182%   NormalizedGlobalURI is the normalized global version of URI.
  183%   Behaves as if defined by:
  184%
  185%   ==
  186%   uri_normalized(URI, Base, NormalizedGlobalURI) :-
  187%           uri_resolve(URI, Base, GlobalURI),
  188%           uri_normalized(GlobalURI, NormalizedGlobalURI).
  189%   ==
  190
  191%!  iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det.
  192%
  193%   NormalizedGlobalIRI is the normalized  global   version  of IRI.
  194%   This is similar to uri_normalized/3, but   does  not do %-escape
  195%   normalization.
  196
  197%!  uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
  198%
  199%   NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
  200%   as if defined by:
  201%
  202%   ==
  203%   uri_normalized(URI, Base, NormalizedGlobalIRI) :-
  204%           uri_resolve(URI, Base, GlobalURI),
  205%           uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
  206%   ==
  207
  208%!  uri_query_components(+String, -Query) is det.
  209%!  uri_query_components(-String, +Query) is det.
  210%
  211%   Perform encoding and decoding of an URI query string. Query is a
  212%   list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
  213%   query elements of the forms Name(Value)  and Name-Value are also
  214%   accepted to enhance interoperability with   the option and pairs
  215%   libraries.  E.g.
  216%
  217%   ==
  218%   ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
  219%   QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
  220%
  221%   ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
  222%   Q = [a=b, c='d+w', n='VU Amsterdam'].
  223%   ==
  224
  225
  226%!  uri_authority_components(+Authority, -Components) is det.
  227%!  uri_authority_components(-Authority, +Components) is det.
  228%
  229%   Break-down the  authority component of  a URI.  The fields  of the
  230%   structure Components  can be accessed  using uri_authority_data/3.
  231%   This  predicate deals  with  IPv6 addresses  written as  ``[ip]``,
  232%   returning the  _ip_ as `host`,  without the enclosing  `[]`.  When
  233%   constructing an  authority string and  the host contains  `:`, the
  234%   host is  embraced in  `[]`.  If  `[]` is  not used  correctly, the
  235%   behavior  should be  considered poorly  defined.  If  there is  no
  236%   balancing  `]` or  the  host part  does not  end  with `]`,  these
  237%   characters  are  considered  normal  characters and  part  of  the
  238%   (invalid) host name.
  239
  240
  241%!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
  242%
  243%   Provide access the uri_authority  structure. Defined field-names
  244%   are: =user=, =password=, =host= and =port=
  245
  246uri_authority_data(user,     uri_authority(U, _, _, _), U).
  247uri_authority_data(password, uri_authority(_, P, _, _), P).
  248uri_authority_data(host,     uri_authority(_, _, H, _), H).
  249uri_authority_data(port,     uri_authority(_, _, _, P), P).
  250
  251
  252%!  uri_encoded(+Component, +Value, -Encoded) is det.
  253%!  uri_encoded(+Component, -Value, +Encoded) is det.
  254%
  255%   Encoded   is   the   URI   encoding   for   Value.   When   encoding
  256%   (Value->Encoded), Component specifies the URI   component  where the
  257%   value is used. It is  one   of  =query_value=, =fragment=, =path= or
  258%   =segment=.  Besides  alphanumerical   characters,    the   following
  259%   characters are passed verbatim (the set   is split in logical groups
  260%   according to RFC3986).
  261%
  262%       $ query_value, fragment :
  263%       "-._~" | "!$'()*,;" | "@" | "/?"
  264%       $ path :
  265%       "-._~" | "!$&'()*,;=" | "@" | "/"
  266%       $ segment :
  267%       "-._~" | "!$&'()*,;=" | "@"
  268
  269%!  uri_iri(+URI, -IRI) is det.
  270%!  uri_iri(-URI, +IRI) is det.
  271%
  272%   Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
  273%   a fully expanded  Unicode  string.   Unicode  strings  are first
  274%   encoded into UTF-8, after which %-encoding takes place.
  275%
  276%   @error syntax_error(Culprit) in mode (+,-) if URI is not a
  277%   legally percent-encoded UTF-8 string.
  278
  279
  280%!  uri_file_name(+URI, -FileName) is semidet.
  281%!  uri_file_name(-URI, +FileName) is det.
  282%
  283%   Convert between a URI and a   local  file_name. This protocol is
  284%   covered by RFC 1738. Please note   that file-URIs use _absolute_
  285%   paths. The mode (-, +) translates  a possible relative path into
  286%   an absolute one.
  287
  288uri_file_name(URI, FileName) :-
  289    nonvar(URI),
  290    !,
  291    uri_components(URI, Components),
  292    uri_data(scheme, Components, File), File == file,
  293    uri_data(authority, Components, Host),
  294    my_host(Host),
  295    uri_data(path, Components, FileNameEnc),
  296    uri_encoded(path, FileName0, FileNameEnc),
  297    delete_leading_slash(FileName0, FileName).
  298uri_file_name(URI, FileName) :-
  299    nonvar(FileName),
  300    !,
  301    absolute_file_name(FileName, Path0),
  302    ensure_leading_slash(Path0, Path),
  303    uri_encoded(path, Path, PathEnc),
  304    uri_data(scheme, Components, file),
  305    uri_data(authority, Components, ''),
  306    uri_data(path, Components, PathEnc),
  307    uri_components(URI, Components).
  308
  309my_host('') :- !.
  310my_host(localhost) :- !.
  311:- if(exists_source(library(socket))).  312my_host(Host) :-
  313    gethostname(Host).
  314:- endif.  315
  316%!  ensure_leading_slash(+WinPath, -Path).
  317%!  delete_leading_slash(+Path, -WinPath).
  318%
  319%   Deal with the fact that absolute paths   in Windows start with a
  320%   drive letter rather than a  /.  For   URIs  we  need a path that
  321%   starts with a /.
  322
  323ensure_leading_slash(Path, SlashPath) :-
  324    (   sub_atom(Path, 0, _, _, /)
  325    ->  SlashPath = Path
  326    ;   atom_concat(/, Path, SlashPath)
  327    ).
  328
  329:- if(current_prolog_flag(windows, true)).  330delete_leading_slash(Path, WinPath) :-
  331    atom_concat(/, WinPath, Path),
  332    is_absolute_file_name(WinPath),
  333    !.
  334:- endif.  335delete_leading_slash(Path, Path).
  336
  337
  338		 /*******************************
  339		 *          MODIFYING           *
  340		 *******************************/
  341
  342%!  uri_edit(+Actions, +URI0, -URI) is det.
  343%
  344%   Modify a  URI according  to Actions.  Actions  is either  a single
  345%   action or a  (nested) list of actions.   Defined primitive actions
  346%   are:
  347%
  348%     - scheme(+Scheme)
  349%       Set the Scheme of the URI (typically `http`, `https`, etc.)
  350%     - user(+User)
  351%       Add/set the user of the authority component.
  352%     - password(+Password)
  353%       Add/set the password of the authority component.
  354%     - host(+Host)
  355%       Add/set the host (or ip address) of the authority component.
  356%     - port(+Port)
  357%       Add/set the port of the authority component.
  358%     - path(+Path)
  359%       Set/extend the `path` component.  If Path is not absolute it
  360%       is taken relative to the path of URI0.
  361%     - search(+KeyValues)
  362%       Extend the `Key=Value` pairs of the current search (query)
  363%       component.   New values replace existing values.  If KeyValues
  364%       is written as =(KeyValues) the current search component is
  365%       ignored.  KeyValues is a list, whose elements are one of
  366%       `Key=Value`, `Key-Value` or `Key(Value)`.
  367%     - fragment(+Fragment)
  368%       Set the Fragment of the uri.
  369%
  370%   Components can be  _removed_ by using a variable  as value, except
  371%   from `path` which  can be reset using path(/) and  query which can
  372%   be dropped using query(=([])).
  373%
  374%   @arg URI0 is either a valid uri or a variable to start fresh.
  375
  376uri_edit(Actions, URI0, URI) :-
  377    (   var(URI0)
  378    ->  URI1 = '/'
  379    ;   URI1 = URI0
  380    ),
  381    uri_components(URI1, Comp0),
  382    edit_components(Actions, Comp0, Comp),
  383    uri_components(URI, Comp).
  384
  385edit_components([], Comp0, Comp) =>
  386    Comp = Comp0.
  387edit_components([H|T], Comp0, Comp) =>
  388    edit_components(H, Comp0, Comp1),
  389    edit_components(T, Comp1, Comp).
  390edit_components(scheme(Scheme), Comp0, Comp) =>
  391    uri_data(scheme, Comp0, Scheme, Comp).
  392edit_components(path(Path), Comp0, Comp) =>
  393    uri_data(path, Comp0, Path0),
  394    (   (   var(Path0)
  395        ;   Path0 == ''
  396        )
  397    ->  Path1 = '/'
  398    ;   Path1 = Path0
  399    ),
  400    uri_normalized(Path, Path1, Path2),
  401    uri_data(path, Comp0, Path2, Comp).
  402edit_components(fragment(Fragment), Comp0, Comp) =>
  403    uri_data(fragment, Comp0, Fragment, Comp).
  404edit_components(Authority, Comp0, Comp),
  405  authority_field(Authority) =>
  406    uri_data(authority, Comp0, Auth0),
  407    (   var(Auth0)
  408    ->  true
  409    ;   uri_authority_components(Auth0, AComp0)
  410    ),
  411    edit_auth_components(Authority, AComp0, AComp),
  412    uri_authority_components(Auth, AComp),
  413    uri_data(authority, Comp0, Auth, Comp).
  414edit_components(query(Search), Comp0, Comp) =>
  415    edit_components(search(Search), Comp0, Comp).
  416edit_components(search(=(Search)), Comp0, Comp) =>
  417    uri_query_components(String, Search),
  418    uri_data(search, Comp0, String, Comp).
  419edit_components(search(Search), Comp0, Comp) =>
  420    uri_data(search, Comp0, SS0),
  421    (   var(SS0)
  422    ->  Search0 = []
  423    ;   uri_query_components(SS0, Search0)
  424    ),
  425    join_search(Search0, Search, Search1),
  426    uri_query_components(SS1, Search1),
  427    uri_data(search, Comp0, SS1, Comp).
  428edit_components(Other, _, _) =>
  429    domain_error(uri_edit, Other).
  430
  431authority_field(user(_)).
  432authority_field(password(_)).
  433authority_field(host(_)).
  434authority_field(port(_)).
  435
  436edit_auth_components(user(User),
  437		     uri_authority(_, Passwd, Host, Port),
  438		     uri_authority(User, Passwd, Host, Port)).
  439edit_auth_components(password(Passwd),
  440		     uri_authority(User, _, Host, Port),
  441		     uri_authority(User, Passwd, Host, Port)).
  442edit_auth_components(host(Host),
  443		     uri_authority(User, Passwd, _, Port),
  444		     uri_authority(User, Passwd, Host, Port)).
  445edit_auth_components(port(Port),
  446		     uri_authority(User, Passwd, Host, _),
  447		     uri_authority(User, Passwd, Host, Port)).
  448
  449join_search([], Search, Search).
  450join_search([N=_|ST], New, Search) :-
  451    (   memberchk(N=_, New)
  452    ->  true
  453    ;   functor(T, N, 1),
  454	memberchk(T, New)
  455    ->  true
  456    ;   memberchk(N-_, New)
  457    ),
  458    !,
  459    join_search(ST, New, Search).
  460join_search([H|ST], New, [H|Search]) :-
  461    join_search(ST, New, Search).
  462
  463
  464                 /*******************************
  465                 *            SANDBOX           *
  466                 *******************************/
  467
  468:- multifile sandbox:safe_primitive/1.  469
  470sandbox:safe_primitive(uri:uri_components(_,_)).
  471sandbox:safe_primitive(uri:uri_normalized(_,_)).
  472sandbox:safe_primitive(uri:iri_normalized(_,_)).
  473sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
  474sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
  475sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
  476sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
  477sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
  478sandbox:safe_primitive(uri:uri_is_global(_)).
  479sandbox:safe_primitive(uri:uri_query_components(_,_)).
  480sandbox:safe_primitive(uri:uri_authority_components(_,_)).
  481sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
  482sandbox:safe_primitive(uri:uri_iri(_,_))