1:- encoding(utf8).
    2:- module(
    3  rdf_clean,
    4  [
    5    rdf_clean_quad/3,   % +BaseIri, +SwiQuad, -Quad
    6    rdf_clean_triple/3, % +BaseIri, +SwiTriple, -Triple
    7    rdf_clean_tuple/3   % +BaseIri, +SwiTuple, -Tuple
    8  ]
    9).

RDF cleaning

   15:- use_module(library(semweb/rdf11), []).   16
   17:- use_module(library(hash_ext)).   18:- use_module(library(rdf_prefix)).   19:- use_module(library(rdf_term)).   20:- use_module(library(uri_ext)).   21
   22:- rdf_meta
   23   rdf_clean_quad(+, t, -),   rdf_clean_triple(+, t, -),   rdf_clean_tuple(+, t, -),   rdf_clean_lexical_form(r, +, -).

rdf_clean_bnode(+BaseIri:iri, +BNode:rdf_bnode, -Iri:iri) is det: Blank node cleaning results in Skolemization / a well-known IRI.
BNodePrefix must uniquely denote the document scope in which the blank node occurs. For this we use the BaseIri argument.

   39rdf_clean_bnode(BaseIri, BNode, Iri) :-
   40  % The SWI-Prolog RDF parsers create long blank node labels that do
   41  % not conform to serialization grammars (e.g.,
   42  % â_:http://www.gutenberg.org/feeds/catalog.rdf.bz2#_:Description2â).
   43  % We use MD5 hashes to (1) at least limit the maximum length a blank
   44  % node label can have, (2) ensure that the blank node label does not
   45  % violate serialization grammars, while (3) retaining the feature
   46  % that the same blank node in the source document receives the same
   47  % Skolemized well-known IRI.
   48  md5(BaseIri-BNode, Hash),
   49  well_known_iri([Hash], Iri).

rdf_clean_graph(+GraphName:rdf_graph_name, -CleanGraphName:rdf_graph_name) is semidet

   56rdf_clean_graph(G1, G3) :-
   57  rdf11:post_graph(G2, G1),
   58  (   G2 == user
   59  ->  rdf11:rdf_default_graph(G3)
   60  ;   rdf11:rdf_default_graph(G2)
   61  ->  G3 = G2
   62  ;   rdf_clean_iri(G2, G3)
   63  ).

rdf_clean_iri(+Iri:atom, -CleanIri:rdf_iri) is semidet

IRIs are assumed to have been made absolute by the RDF parser prior to cleaning (through option `base/1' or `base_iri/1'). If this is not the case, then perform the following prior to cleaning:

rdf_base_iri(BaseIri),
uri_resolve(Iri1, BaseIri, Iri2).

To be done: - There is no implementation for the IRI grammar yet, so we use a conversion from IRIs to URIs, together with an implementation of the URI grammar.

   82rdf_clean_iri(Iri, Iri) :-
   83  atom(Iri).

rdf_clean_lexical_form(+D:rdf_iri, +Lex:atom, -CleanLex:atom) is det

   89% language-tagged string
   90rdf_clean_lexical_form(rdf:langString, Lex, _) :- !,
   91  throw(error(rdf_error(missing_language_tag,Lex),rdf_clean_lexical_form/3)).
   92% typed literal
   93rdf_clean_lexical_form(D, Lex1, Lex2) :-
   94  rdf_lexical_value(D, Lex1, Value),
   95  rdf_lexical_value(D, Lex2, Value),
   96  % Emit a warning if the lexical form is not canonical.
   97  (   Lex1 \== Lex2
   98  ->  print_message(
   99        warning,
  100        error(
  101          rdf_error(non_canonical_lexical_form,D,Lex1,Lex2),
  102          rdf_clean_lexical_form/3
  103        )
  104      )
  105  ;   true
  106  ).

rdf_clean_literal(+Literal:rdf_literal, -CleanLiteral:rdf_literal) is det

  112% language-tagged string (rdf:langString)
  113rdf_clean_literal(literal(lang(LTag1,Lex)), literal(lang(LTag2,Lex))) :- !,
  114  downcase_atom(LTag1, LTag2),
  115  % Emit a warning if the language tag is not canonical.
  116  (   LTag1 \== LTag2
  117  ->  print_message(
  118        warning,
  119        error(
  120          rdf_error(non_canonical_language_tag,LTag1),
  121          rdf_clean_literal/2
  122        )
  123      )
  124  ;   true
  125  ).
  126% typed literal
  127rdf_clean_literal(literal(type(D1,Lex1)), literal(type(D2,Lex2))) :- !,
  128  rdf_clean_iri(D1, D2),
  129  rdf_clean_lexical_form(D2, Lex1, Lex2).
  130% simple literal (RDF 1.0): quickly clean this to a typed literal (RDF 1.1).
  131rdf_clean_literal(literal(Lex), literal(type(D,Lex))) :-
  132  rdf_equal(xsd:string, D).

rdf_clean_node(+BaseIri:iri, +Node:rdf_node, -CleanNode:rdf_node) is det

  138rdf_clean_node(BaseIri, Node1, Node2) :-
  139  rdf_clean_nonliteral(BaseIri, Node1, Node2), !.
  140rdf_clean_node(_, Literal1, Literal2) :-
  141  rdf_clean_literal(Literal1, Literal2).

rdf_clean_nonliteral(+BaseIri:iri, +NonLiteral:or([rdf_bnode,rdf_iri]), -CleanNonLiteral:or([rdf_bnode,rdf_iri])) is semidet

  149% blank node
  150rdf_clean_nonliteral(BaseIri, BNode, Iri) :-
  151  rdf_is_bnode(BNode), !,
  152  rdf_clean_bnode(BaseIri, BNode, Iri).
  153% IRI
  154rdf_clean_nonliteral(_, Iri1, Iri2) :-
  155  rdf_is_iri(Iri1), !,
  156  rdf_clean_iri(Iri1, Iri2).

rdf_clean_quad(+BaseIri:iri, +SwiQuad, -Quad:rdf_quad) is semidet

  162rdf_clean_quad(BaseIri, rdf(S1,P1,O1,G1), tp(S2,P2,O2,G2)) :-
  163  rdf_clean_triple(BaseIri, rdf(S1,P1,O1), tp(S2,P2,O2)),
  164  rdf_clean_graph(G1, G2).

rdf_clean_triple(+BaseIri:iri, +SwiTriple, -Triple:rdf_triple) is semidet

  170rdf_clean_triple(BaseIri, rdf(S1,P1,O1), tp(S2,P2,O2)) :-
  171  rdf_clean_nonliteral(BaseIri, S1, S2),
  172  rdf_clean_iri(P1, P2),
  173  rdf_clean_node(BaseIri, O1, O2).

rdf_clean_tuple(+BaseIri:iri, +SwiTuple, -Tuple:rdf_tuple) is semidet

  179% triple
  180rdf_clean_tuple(BaseIri, rdf(S,P,O), Triple) :- !,
  181  rdf_clean_triple(BaseIri, rdf(S,P,O), Triple).
  182% quadruple
  183rdf_clean_tuple(BaseIri, Quad, CleanQuad) :-
  184  rdf_clean_quad(BaseIri, Quad, CleanQuad)