1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2009-2023, VU University Amsterdam 7 SWI-Prolog Solutions b.v. 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in 19 the documentation and/or other materials provided with the 20 distribution. 21 22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 POSSIBILITY OF SUCH DAMAGE. 34*/ 35 36:- module(uri, 37 [ uri_components/2, % ?URI, ?Components 38 uri_data/3, % ?Field, +Components, ?Data 39 uri_data/4, % +Field, +Components, -Data, -New 40 uri_edit/3, % +Actions,+URI0,-URI 41 42 uri_normalized/2, % +URI, -NormalizedURI 43 iri_normalized/2, % +IRI, -NormalizedIRI 44 uri_normalized_iri/2, % +URI, -NormalizedIRI 45 uri_normalized/3, % +URI, +Base, -NormalizedURI 46 iri_normalized/3, % +IRI, +Base, -NormalizedIRI 47 uri_normalized_iri/3, % +URI, +Base, -NormalizedIRI 48 uri_resolve/3, % +URI, +Base, -AbsURI 49 uri_is_global/1, % +URI 50 uri_query_components/2, % ?QueryString, ?NameValueList 51 uri_authority_components/2, % ?Authority, ?Components 52 uri_authority_data/3, % ?Field, ?Components, ?Data 53 % Encoding 54 uri_encoded/3, % +Component, ?Value, ?Encoded 55 uri_file_name/2, % ?URI, ?Path 56 uri_iri/2 % ?URI, ?IRI 57 ]). 58:- autoload(library(error), [domain_error/2]). 59:- if(exists_source(library(socket))). 60:- autoload(library(socket), [gethostname/1]). 61:- endif. 62 63:- use_foreign_library(foreign(uri)). 64 65/** <module> Process URIs 66 67This library provides high-performance C-based primitives for 68manipulating URIs. We decided for a C-based implementation for the much 69better performance on raw character manipulation. Notably, URI handling 70primitives are used in time-critical parts of RDF processing. This 71implementation is based on RFC-3986: 72 73 http://labs.apache.org/webarch/uri/rfc/rfc3986.html 74 75The URI processing in this library is rather liberal. That is, we break 76URIs according to the rules, but we do not validate that the components 77are valid. Also, percent-decoding for IRIs is liberal. It first tries 78UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim. 79 80Earlier experience has shown that strict enforcement of the URI syntax 81results in many errors that are accepted by many other web-document 82processing tools. 83*/ 84 85%! uri_components(+URI, -Components) is det. 86%! uri_components(-URI, +Components) is det. 87% 88% Break a URI into its 5 basic components according to the 89% RFC-3986 regular expression: 90% 91% == 92% ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 93% 12 3 4 5 6 7 8 9 94% == 95% 96% @param Components is a term uri_components(Scheme, Authority, 97% Path, Search, Fragment). If a URI is *parsed*, i.e., using mode 98% (+,-), components that are not found are left _uninstantiated_ 99% (variable). See uri_data/3 for accessing this structure. 100 101%! uri_data(?Field, +Components, ?Data) is semidet. 102% 103% Provide access the uri_component structure. Defined field-names 104% are: =scheme=, =authority=, =path=, =search= and =fragment= 105 106uri_data(scheme, uri_components(S, _, _, _, _), S). 107uri_data(authority, uri_components(_, A, _, _, _), A). 108uri_data(path, uri_components(_, _, P, _, _), P). 109uri_data(search, uri_components(_, _, _, S, _), S). 110uri_data(fragment, uri_components(_, _, _, _, F), F). 111 112%! uri_data(+Field, +Components, +Data, -NewComponents) is semidet. 113% 114% NewComponents is the same as Components with Field set to Data. 115 116uri_data(scheme, uri_components(_, A, P, Q, F), S, 117 uri_components(S, A, P, Q, F)). 118uri_data(authority, uri_components(S, _, P, Q, F), A, 119 uri_components(S, A, P, Q, F)). 120uri_data(path, uri_components(S, A, _, Q, F), P, 121 uri_components(S, A, P, Q, F)). 122uri_data(search, uri_components(S, A, P, _, F), Q, 123 uri_components(S, A, P, Q, F)). 124uri_data(fragment, uri_components(S, A, P, Q, _), F, 125 uri_components(S, A, P, Q, F)). 126 127%! uri_normalized(+URI, -NormalizedURI) is det. 128% 129% NormalizedURI is the normalized form of URI. Normalization is 130% syntactic and involves the following steps: 131% 132% * 6.2.2.1. Case Normalization 133% * 6.2.2.2. Percent-Encoding Normalization 134% * 6.2.2.3. Path Segment Normalization 135 136%! iri_normalized(+IRI, -NormalizedIRI) is det. 137% 138% NormalizedIRI is the normalized form of IRI. Normalization is 139% syntactic and involves the following steps: 140% 141% * 6.2.2.1. Case Normalization 142% * 6.2.2.3. Path Segment Normalization 143% 144% @see This is similar to uri_normalized/2, but does not do 145% normalization of %-escapes. 146 147%! uri_normalized_iri(+URI, -NormalizedIRI) is det. 148% 149% As uri_normalized/2, but percent-encoding is translated into IRI 150% Unicode characters. The translation is liberal: valid UTF-8 151% sequences of %-encoded bytes are mapped to the Unicode 152% character. Other %XX-sequences are mapped to the corresponding 153% ISO-Latin-1 character and sole % characters are left untouched. 154% 155% @see uri_iri/2. 156 157 158%! uri_is_global(+URI) is semidet. 159% 160% True if URI has a scheme. The semantics is the same as the code 161% below, but the implementation is more efficient as it does not need 162% to parse the other components, nor needs to bind the scheme. The 163% condition to demand a scheme of more than one character is added to 164% avoid confusion with DOS path names. 165% 166% == 167% uri_is_global(URI) :- 168% uri_components(URI, Components), 169% uri_data(scheme, Components, Scheme), 170% nonvar(Scheme), 171% atom_length(Scheme, Len), 172% Len > 1. 173% == 174 175%! uri_resolve(+URI, +Base, -GlobalURI) is det. 176% 177% Resolve a possibly local URI relative to Base. This implements 178% http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform 179 180%! uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det. 181% 182% NormalizedGlobalURI is the normalized global version of URI. 183% Behaves as if defined by: 184% 185% == 186% uri_normalized(URI, Base, NormalizedGlobalURI) :- 187% uri_resolve(URI, Base, GlobalURI), 188% uri_normalized(GlobalURI, NormalizedGlobalURI). 189% == 190 191%! iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det. 192% 193% NormalizedGlobalIRI is the normalized global version of IRI. 194% This is similar to uri_normalized/3, but does not do %-escape 195% normalization. 196 197%! uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det. 198% 199% NormalizedGlobalIRI is the normalized global IRI of URI. Behaves 200% as if defined by: 201% 202% == 203% uri_normalized(URI, Base, NormalizedGlobalIRI) :- 204% uri_resolve(URI, Base, GlobalURI), 205% uri_normalized_iri(GlobalURI, NormalizedGlobalIRI). 206% == 207 208%! uri_query_components(+String, -Query) is det. 209%! uri_query_components(-String, +Query) is det. 210% 211% Perform encoding and decoding of an URI query string. Query is a 212% list of fully decoded (Unicode) Name=Value pairs. In mode (-,+), 213% query elements of the forms Name(Value) and Name-Value are also 214% accepted to enhance interoperability with the option and pairs 215% libraries. E.g. 216% 217% == 218% ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']). 219% QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'. 220% 221% ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q). 222% Q = [a=b, c='d+w', n='VU Amsterdam']. 223% == 224 225 226%! uri_authority_components(+Authority, -Components) is det. 227%! uri_authority_components(-Authority, +Components) is det. 228% 229% Break-down the authority component of a URI. The fields of the 230% structure Components can be accessed using uri_authority_data/3. 231% This predicate deals with IPv6 addresses written as ``[ip]``, 232% returning the _ip_ as `host`, without the enclosing `[]`. When 233% constructing an authority string and the host contains `:`, the 234% host is embraced in `[]`. If `[]` is not used correctly, the 235% behavior should be considered poorly defined. If there is no 236% balancing `]` or the host part does not end with `]`, these 237% characters are considered normal characters and part of the 238% (invalid) host name. 239 240 241%! uri_authority_data(+Field, ?Components, ?Data) is semidet. 242% 243% Provide access the uri_authority structure. Defined field-names 244% are: =user=, =password=, =host= and =port= 245 uri_authority(U, _, _, _), U) (user, . 247uri_authority_data(password, uri_authority(_, P, _, _), P). 248uri_authority_data(host, uri_authority(_, _, H, _), H). 249uri_authority_data(port, uri_authority(_, _, _, P), P). 250 251 252%! uri_encoded(+Component, +Value, -Encoded) is det. 253%! uri_encoded(+Component, -Value, +Encoded) is det. 254% 255% Encoded is the URI encoding for Value. When encoding 256% (Value->Encoded), Component specifies the URI component where the 257% value is used. It is one of =query_value=, =fragment=, =path= or 258% =segment=. Besides alphanumerical characters, the following 259% characters are passed verbatim (the set is split in logical groups 260% according to RFC3986). 261% 262% $ query_value, fragment : 263% "-._~" | "!$'()*,;" | "@" | "/?" 264% $ path : 265% "-._~" | "!$&'()*,;=" | "@" | "/" 266% $ segment : 267% "-._~" | "!$&'()*,;=" | "@" 268 269%! uri_iri(+URI, -IRI) is det. 270%! uri_iri(-URI, +IRI) is det. 271% 272% Convert between a URI, encoded in US-ASCII and an IRI. An IRI is 273% a fully expanded Unicode string. Unicode strings are first 274% encoded into UTF-8, after which %-encoding takes place. 275% 276% @error syntax_error(Culprit) in mode (+,-) if URI is not a 277% legally percent-encoded UTF-8 string. 278 279 280%! uri_file_name(+URI, -FileName) is semidet. 281%! uri_file_name(-URI, +FileName) is det. 282% 283% Convert between a URI and a local file_name. This protocol is 284% covered by RFC 1738. Please note that file-URIs use _absolute_ 285% paths. The mode (-, +) translates a possible relative path into 286% an absolute one. 287 288uri_file_name(URI, FileName) :- 289 nonvar(URI), 290 !, 291 uri_components(URI, Components), 292 uri_data(scheme, Components, File), File == file, 293 uri_data(authority, Components, Host), 294 my_host(Host), 295 uri_data(path, Components, FileNameEnc), 296 uri_encoded(path, FileName0, FileNameEnc), 297 delete_leading_slash(FileName0, FileName). 298uri_file_name(URI, FileName) :- 299 nonvar(FileName), 300 !, 301 absolute_file_name(FileName, Path0), 302 ensure_leading_slash(Path0, Path), 303 uri_encoded(path, Path, PathEnc), 304 uri_data(scheme, Components, file), 305 uri_data(authority, Components, ''), 306 uri_data(path, Components, PathEnc), 307 uri_components(URI, Components). 308 309my_host('') :- !. 310my_host(localhost) :- !. 311:- if(exists_source(library(socket))). 312my_host(Host) :- 313 gethostname(Host). 314:- endif. 315 316%! ensure_leading_slash(+WinPath, -Path). 317%! delete_leading_slash(+Path, -WinPath). 318% 319% Deal with the fact that absolute paths in Windows start with a 320% drive letter rather than a /. For URIs we need a path that 321% starts with a /. 322 323ensure_leading_slash(Path, SlashPath) :- 324 ( sub_atom(Path, 0, _, _, /) 325 -> SlashPath = Path 326 ; atom_concat(/, Path, SlashPath) 327 ). 328 329:- if(current_prolog_flag(windows, true)). 330delete_leading_slash(Path, WinPath) :- 331 atom_concat(/, WinPath, Path), 332 is_absolute_file_name(WinPath), 333 !. 334:- endif. 335delete_leading_slash(Path, Path). 336 337 338 /******************************* 339 * MODIFYING * 340 *******************************/ 341 342%! uri_edit(+Actions, +URI0, -URI) is det. 343% 344% Modify a URI according to Actions. Actions is either a single 345% action or a (nested) list of actions. Defined primitive actions 346% are: 347% 348% - scheme(+Scheme) 349% Set the Scheme of the URI (typically `http`, `https`, etc.) 350% - user(+User) 351% Add/set the user of the authority component. 352% - password(+Password) 353% Add/set the password of the authority component. 354% - host(+Host) 355% Add/set the host (or ip address) of the authority component. 356% - port(+Port) 357% Add/set the port of the authority component. 358% - path(+Path) 359% Set/extend the `path` component. If Path is not absolute it 360% is taken relative to the path of URI0. 361% - search(+KeyValues) 362% Extend the `Key=Value` pairs of the current search (query) 363% component. New values replace existing values. If KeyValues 364% is written as =(KeyValues) the current search component is 365% ignored. KeyValues is a list, whose elements are one of 366% `Key=Value`, `Key-Value` or `Key(Value)`. 367% - fragment(+Fragment) 368% Set the Fragment of the uri. 369% 370% Components can be _removed_ by using a variable as value, except 371% from `path` which can be reset using path(/) and query which can 372% be dropped using query(=([])). 373% 374% @arg URI0 is either a valid uri or a variable to start fresh. 375 376uri_edit(Actions, URI0, URI) :- 377 ( var(URI0) 378 -> URI1 = '/' 379 ; URI1 = URI0 380 ), 381 uri_components(URI1, Comp0), 382 edit_components(Actions, Comp0, Comp), 383 uri_components(URI, Comp). 384 385edit_components([], Comp0, Comp) => 386 Comp = Comp0. 387edit_components([H|T], Comp0, Comp) => 388 edit_components(H, Comp0, Comp1), 389 edit_components(T, Comp1, Comp). 390edit_components(scheme(Scheme), Comp0, Comp) => 391 uri_data(scheme, Comp0, Scheme, Comp). 392edit_components(path(Path), Comp0, Comp) => 393 uri_data(path, Comp0, Path0), 394 ( ( var(Path0) 395 ; Path0 == '' 396 ) 397 -> Path1 = '/' 398 ; Path1 = Path0 399 ), 400 uri_normalized(Path, Path1, Path2), 401 uri_data(path, Comp0, Path2, Comp). 402edit_components(fragment(Fragment), Comp0, Comp) => 403 uri_data(fragment, Comp0, Fragment, Comp). 404edit_components(Authority, Comp0, Comp), 405 authority_field(Authority) => 406 uri_data(authority, Comp0, Auth0), 407 ( var(Auth0) 408 -> true 409 ; uri_authority_components(Auth0, AComp0) 410 ), 411 edit_auth_components(Authority, AComp0, AComp), 412 uri_authority_components(Auth, AComp), 413 uri_data(authority, Comp0, Auth, Comp). 414edit_components(query(Search), Comp0, Comp) => 415 edit_components(search(Search), Comp0, Comp). 416edit_components(search(=(Search)), Comp0, Comp) => 417 uri_query_components(String, Search), 418 uri_data(search, Comp0, String, Comp). 419edit_components(search(Search), Comp0, Comp) => 420 uri_data(search, Comp0, SS0), 421 ( var(SS0) 422 -> Search0 = [] 423 ; uri_query_components(SS0, Search0) 424 ), 425 join_search(Search0, Search, Search1), 426 uri_query_components(SS1, Search1), 427 uri_data(search, Comp0, SS1, Comp). 428edit_components(Other, _, _) => 429 domain_error(uri_edit, Other). 430 user(_)) (. 432authority_field(password(_)). 433authority_field(host(_)). 434authority_field(port(_)). 435 436edit_auth_components(user(User), 437 uri_authority(_, Passwd, Host, Port), 438 uri_authority(User, Passwd, Host, Port)). 439edit_auth_components(password(Passwd), 440 uri_authority(User, _, Host, Port), 441 uri_authority(User, Passwd, Host, Port)). 442edit_auth_components(host(Host), 443 uri_authority(User, Passwd, _, Port), 444 uri_authority(User, Passwd, Host, Port)). 445edit_auth_components(port(Port), 446 uri_authority(User, Passwd, Host, _), 447 uri_authority(User, Passwd, Host, Port)). 448 449join_search([], Search, Search). 450join_search([N=_|ST], New, Search) :- 451 ( memberchk(N=_, New) 452 -> true 453 ; functor(T, N, 1), 454 memberchk(T, New) 455 -> true 456 ; memberchk(N-_, New) 457 ), 458 !, 459 join_search(ST, New, Search). 460join_search([H|ST], New, [H|Search]) :- 461 join_search(ST, New, Search). 462 463 464 /******************************* 465 * SANDBOX * 466 *******************************/ 467 468:- multifile sandbox:safe_primitive/1. 469 470sandbox:safe_primitive(uri:uri_components(_,_)). 471sandbox:safe_primitive(uri:uri_normalized(_,_)). 472sandbox:safe_primitive(uri:iri_normalized(_,_)). 473sandbox:safe_primitive(uri:uri_normalized_iri(_,_)). 474sandbox:safe_primitive(uri:uri_normalized(_,_,_)). 475sandbox:safe_primitive(uri:iri_normalized(_,_,_)). 476sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)). 477sandbox:safe_primitive(uri:uri_resolve(_,_,_)). 478sandbox:safe_primitive(uri:uri_is_global(_)). 479sandbox:safe_primitive(uri:uri_query_components(_,_)). 480sandbox:safe_primitive(uri:uri_authority_components(_,_)). 481sandbox:safe_primitive(uri:uri_encoded(_,_,_)). 482sandbox:safe_primitive(uri:uri_iri(_,_))