1:- module(html_scrapes, [scrape_row/2]). 2 3:- use_module(library(sgml)).
thead
elements.
Scrapes distinct rows. Distinct is important because HTML documents contain tables within tables within tables. Attempts to permit some flexibility. Asking for sub-rows finds head sub-rows; catches and filters out by disunifying data with heads.
15scrape_row(URL, Row) :- 16 distinct(Row, scrape_row_(URL, Row)). 17 18scrape_row_(URL, Row) :- 19 load_html(URL, DOM, []), 20 xpath(DOM, //(table), Table), 21 findall(Head, xpath(Table, //(thead)/tr/td(normalize_space), Head), Heads), 22 xpath(Table, //(tr), TR), 23 findall(Datum, xpath(TR, //(td(normalize_space)), Datum), Data), 24 Data \== Heads, 25 scrape_row__(Heads, Data, Columns), 26 Row =.. [row|Columns]. 27 28scrape_row__([], [], []). 29scrape_row__([Head0|Heads], [Datum|Data], [Column|Columns]) :- 30 restyle_identifier(one_two, Head0, Head), 31 Column =.. [Head, Datum], 32 scrape_row__(Heads, Data, Columns)