% This file is part of the Attempto Parsing Engine (APE).
% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
%
% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it
% under the terms of the GNU Lesser General Public License as published by the Free Software
% Foundation, either version 3 of the License, or (at your option) any later version.
%
% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT
% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
% PURPOSE. See the GNU Lesser General Public License for more details.
%
% You should have received a copy of the GNU Lesser General Public License along with the Attempto
% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/.


:- module(ace_niceace, [
        tokens_to_sentences/2,
        atom_capitalize/2,
        pronoun_split/2,
        pronoun_split/3,
        ace_niceace/2,
        word_article/2
    ]).

:- use_module('../lexicon/chars', [
        is_sentence_end_symbol/1
    ]).

/** <module> ACE beautifier

@author Kaarel Kaljurand
@version 2012-01-04

This code does the following:

* a -> an (if appropriate)
* connect every comma, period, and question mark to the preceding word

*/


%% tokens_to_sentences(+ListOfList:list, -ListOfAtom:list) is det.
%
% Turns every token list into a single atom: the tokens are beautified
% by ace_niceace/2, the first token is capitalized by atom_capitalize/2,
% and the result is joined with single spaces.
% An empty token list is mapped to the atom 'ERROR'.
%
% @param ListOfList is a list (ACE sentences) of lists (ACE tokens)
% @param ListOfAtom is a list of atoms (ACE sentences)
%
% @bug get rid of 'ERROR' (maybe fail in this case?)
%
% called from: get_ape_results
%
tokens_to_sentences([], []).

tokens_to_sentences([TokenList | Tail], [Atom | RestT]) :-
    (
        TokenList = []
    ->
        Atom = 'ERROR'
    ;
        ace_niceace(TokenList, [FirstToken | RestTokenList]),
        atom_capitalize(FirstToken, FirstTokenCapitalized),
        % atomic_list_concat/3 replaces the deprecated concat_atom/3.
        atomic_list_concat([FirstTokenCapitalized | RestTokenList], ' ', Atom)
    ),
    tokens_to_sentences(Tail, RestT).


%% pronoun_split(+Token:atom, -TokenPair:term) is semidet.
%% pronoun_split(-Token:atom, +TokenPair:term) is nondet.
%
% @param Token is an ACE token
% @param TokenPair is a pair (Tok1, Tok2) of tokens, the 2nd of which is
%        one of {-thing, -body, -one}
%
pronoun_split(Token, TokenPair) :-
    pronoun_split(Token, _, TokenPair).


%% pronoun_split(?Token:atom, ?Case:atom, ?TokenPair:term) is nondet.
%
% Table of indefinite pronouns. For every TokenPair the lower-case entry
% precedes the upper-case one, so the first solution for a given pair
% is the lower-case token.
%
% @param Token is an ACE token
% @param Case is one of {lower, upper}, the case of the first letter of Token
% @param TokenPair is a pair (Tok1, Tok2) of tokens
%
pronoun_split(everything, lower, (every, '-thing')).
pronoun_split('Everything', upper, (every, '-thing')).
pronoun_split(nothing, lower, (no, '-thing')).
pronoun_split('Nothing', upper, (no, '-thing')).
pronoun_split(something, lower, (a, '-thing')).
pronoun_split('Something', upper, (a, '-thing')).

pronoun_split(everybody, lower, (every, '-body')).
pronoun_split('Everybody', upper, (every, '-body')).
pronoun_split(nobody, lower, (no, '-body')).
pronoun_split('Nobody', upper, (no, '-body')).
pronoun_split(somebody, lower, (a, '-body')).
pronoun_split('Somebody', upper, (a, '-body')).

pronoun_split(everyone, lower, (every, '-one')).
pronoun_split('Everyone', upper, (every, '-one')).
pronoun_split(noone, lower, (no, '-one')).
pronoun_split('Noone', upper, (no, '-one')).
pronoun_split(someone, lower, (a, '-one')).
pronoun_split('Someone', upper, (a, '-one')).


%% atom_capitalize(+Atom:atom, -CapitalizedAtom:atom) is det.
%
% Simple predicate to capitalize those ACE words which can occur
% in the beginning of the sentence. Any other token is returned
% unchanged.
%
% TODO: every preposition can also start a sentence
%
atom_capitalize(a, 'A') :- !.
atom_capitalize(the, 'The') :- !.
atom_capitalize(somebody, 'Somebody') :- !.
atom_capitalize(something, 'Something') :- !.
% The '-one' pronouns are produced by ace_merge/3 just like the
% '-thing' and '-body' ones, so they need a capitalized form as well.
atom_capitalize(someone, 'Someone') :- !.
atom_capitalize(at, 'At') :- !.
atom_capitalize(less, 'Less') :- !.
atom_capitalize(more, 'More') :- !.
atom_capitalize(exactly, 'Exactly') :- !.
atom_capitalize(some, 'Some') :- !.
atom_capitalize(an, 'An') :- !.
atom_capitalize(there, 'There') :- !.
atom_capitalize(if, 'If') :- !.
atom_capitalize(it, 'It') :- !.
atom_capitalize(is, 'Is') :- !.
atom_capitalize(are, 'Are') :- !.
atom_capitalize(do, 'Do') :- !.
atom_capitalize(does, 'Does') :- !.
atom_capitalize(for, 'For') :- !.
atom_capitalize(not, 'Not') :- !.
atom_capitalize(each, 'Each') :- !.
atom_capitalize(every, 'Every') :- !.
atom_capitalize(everything, 'Everything') :- !.
atom_capitalize(everybody, 'Everybody') :- !.
atom_capitalize(everyone, 'Everyone') :- !.
atom_capitalize(no, 'No') :- !.
atom_capitalize(nothing, 'Nothing') :- !.
atom_capitalize(nobody, 'Nobody') :- !.
atom_capitalize(noone, 'Noone') :- !.
atom_capitalize(all, 'All') :- !.
atom_capitalize(who, 'Who') :- !.
atom_capitalize(whose, 'Whose') :- !.
atom_capitalize(what, 'What') :- !.
atom_capitalize(which, 'Which') :- !.
atom_capitalize(where, 'Where') :- !.
atom_capitalize(when, 'When') :- !.
atom_capitalize(how, 'How') :- !.
atom_capitalize(can, 'Can') :- !.
atom_capitalize(must, 'Must') :- !.
atom_capitalize(should, 'Should') :- !.
atom_capitalize(may, 'May') :- !.
atom_capitalize(he, 'He') :- !.
atom_capitalize(his, 'His') :- !.
atom_capitalize(she, 'She') :- !.
atom_capitalize(her, 'Her') :- !.
atom_capitalize(they, 'They') :- !.
atom_capitalize(their, 'Their') :- !.
atom_capitalize(its, 'Its') :- !.
atom_capitalize(Token, Token).


%% ace_niceace(+TokenListIn:list, -TokenListOut:list) is det.
%
% @param TokenListIn is a list of ACE tokens
% @param TokenListOut is a list of ACE tokens
%
% @tbd Some of these transformations (e.g. a -> an) should
% be optional.

% Strip the sentence start marker (^) if present.
ace_niceace([(^) | In], Out) :-
    !,
    ace_niceace_x(In, Out).

ace_niceace(In, Out) :-
    ace_niceace_x(In, Out).


%% ace_niceace_x(+TokenListIn:list, -TokenListOut:list) is det.
%
% Worker of ace_niceace/2: repeatedly lets ace_merge/3 rewrite the front
% of the token list and prepends the emitted prefix to the output.
%
ace_niceace_x([], []) :-
    !.

ace_niceace_x(In, Out) :-
    ace_merge(In, Prefix, Rest),
    simple_append(Prefix, RestOut, Out),
    ace_niceace_x(Rest, RestOut).


%% ace_merge(+TokenList:list, -Prefix:list, -NewTokenList:list) is nondet.
%
% Rewrites the front of TokenList. The clauses are tried in order and
% every clause but the last commits with a cut.
%
% @param TokenList is a list of ACE tokens
% @param Prefix is a list of ACE tokens
% @param NewTokenList is a list of ACE tokens
%

% every + -thing --> everything (etc.)
ace_merge([Tok1, Tok2 | Rest], [Tok1Tok2], Rest) :-
    pronoun_split(Tok1Tok2, (Tok1, Tok2)),
    !.

% Article before a prefixed word (n:, a:, unknowncat:): choose a/an by the
% word after the colon; the prefixed word itself stays in the input.
ace_merge([a, Prefix, ':', Token | Rest], [Article], [Prefix, ':', Token | Rest]) :-
    member(Prefix, [n, a, unknowncat]),
    !,
    word_article(Token, Article).

% Article before a plain word. A following ':' means that this 'a' is
% itself a prefix (handled by a later clause), not an article.
ace_merge([a, Token | Rest], [Article], [Token | Rest]) :-
    Token \= ':',
    !,
    word_article(Token, Article).

% Attach a sentence end symbol to the preceding token.
ace_merge([Token, SentenceEndSym | Rest], [TokenPeriod], Rest) :-
    is_sentence_end_symbol(SentenceEndSym),
    !,
    atomic_list_concat([Token, SentenceEndSym], TokenPeriod).

% Attach a comma to the preceding token.
ace_merge([Token, ',' | Rest], [TokenComma], Rest) :-
    !,
    atomic_list_concat([Token, ','], TokenComma).

% Glue a word class prefix to its word, e.g. n + : + dog --> 'n:dog'.
ace_merge([Prefix, ':', Token | Rest], [PrefixToken], Rest) :-
    member(Prefix, [n, v, p, a]),
    !,
    atomic_list_concat([Prefix, ':', Token], PrefixToken).

% Drop the unknowncat prefix, keeping only the word.
ace_merge([unknowncat, ':', Token | Rest], [Token], Rest) :-
    !.

% Anything else is copied unchanged.
ace_merge([Token | Rest], [Token], Rest).


%% simple_append(?List1:list, ?List2:list, ?List3:list) is nondet.
%
% @param List1 is an empty list or a list of one element
% @param List2 is a list
% @param List3 is a list
%
% This is a special case of append/3
%
simple_append([], List, List).
simple_append([X], List, [X | List]).


%% word_article(+Word:atom, -Article:atom) is det.
%
% This code decides on the article (of the noun phrase)
% on the basis of a word (either adjective or noun).
% The word is lower-cased first; the article is 'an' if its letters match
% good_an_letters/1 and do not match bad_an_letters/1, otherwise 'a'.
%
% @param Word is an ACE token
% @param Article is an ACE indefinite article, one of {a, an}
%
word_article(Word, an) :-
    downcase_atom(Word, DowncaseWord),
    atom_chars(DowncaseWord, WordChars),
    good_an_letters(WordChars),
    \+ bad_an_letters(WordChars),
    !.

word_article(_, a).


%% good_an_letters(?LetterList:list) is nondet.
%
% @param LetterList is a list of letters that a word following 'an' can consist of
%
good_an_letters([a | _]).
good_an_letters([e | _]).
good_an_letters([i | _]).
good_an_letters([o | _]).
good_an_letters([u | _]).

good_an_letters([h, o, n, o, r, a, b, l, e | _]).
good_an_letters([h, e, i, r | _]).
good_an_letters([h, o, u, r | _]).

good_an_letters([f]).
good_an_letters([h]).
good_an_letters([l]).
good_an_letters([m]).
good_an_letters([n]).
good_an_letters([r]).
good_an_letters([s]).
good_an_letters([x]).

good_an_letters([f, '-' | _]).
good_an_letters([h, '-' | _]).
good_an_letters([l, '-' | _]).
good_an_letters([m, '-' | _]).
good_an_letters([n, '-' | _]).
good_an_letters([r, '-' | _]).
good_an_letters([s, '-' | _]).
good_an_letters([x, '-' | _]).


%% bad_an_letters(?LetterList:list) is nondet.
%
% @param LetterList is a list of letters that a word following 'an' cannot consist of
%
bad_an_letters([u]).
bad_an_letters([u, '-' | _]).
bad_an_letters([u, r, i | _]).
bad_an_letters([u, t, i | _]).
bad_an_letters([u, n, i | _]).
bad_an_letters([u, s, a | _]).
bad_an_letters([u, s, e | _]).
%bad_an_letters([u, k, '-' | _]).
bad_an_letters([u, k | _]).
bad_an_letters([o, n, e | _]).