1% This file is part of the Attempto Parsing Engine (APE).
    2% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
    3%
    4% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it
    5% under the terms of the GNU Lesser General Public License as published by the Free Software
    6% Foundation, either version 3 of the License, or (at your option) any later version.
    7%
    8% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT
    9% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
   10% PURPOSE. See the GNU Lesser General Public License for more details.
   11%
   12% You should have received a copy of the GNU Lesser General Public License along with the Attempto
   13% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/.
   14
   15
   16:- module(ace_niceace, [
   17		tokens_to_sentences/2,
   18		atom_capitalize/2,
   19		pronoun_split/2,
   20		pronoun_split/3,
   21		ace_niceace/2,
   22		word_article/2
   23	]).   24
   25
   26:- use_module('../lexicon/chars', [
   27		is_sentence_end_symbol/1
   28	]).

ACE beautifier

author

- Kaarel Kaljurand

version

- 2012-01-04

This code does the following:

a -> an (if appropriate)
connect every comma, period, and question mark to the preceding word */

tokens_to_sentences(+ListOfList:list, -ListOfAtom:list) is det

Arguments:

`ListOfList`	- is a list (ACE sentences) of lists (ACE tokens)
`ListOfAtom`	- is a list of atoms (ACE sentences)

bug

- get rid of 'ERROR' (maybe fail in this case?)

called from: get_ape_results

   52tokens_to_sentences([], []).
   53
   54tokens_to_sentences([TokenList | Tail], [Atom | RestT]) :-
   55	(
   56		TokenList = []
   57	->
   58		Atom = 'ERROR'
   59    ;
   60		ace_niceace(TokenList, [FirstToken | RestTokenList]),
   61		atom_capitalize(FirstToken, FirstTokenCapitalized),
   62		concat_atom([FirstTokenCapitalized | RestTokenList], ' ', Atom)
   63	),
   64	tokens_to_sentences(Tail, RestT).

pronoun_split(+Token:atom, -TokenPair:term) is semidet

pronoun_split(-Token:atom, +TokenPair:term) is nondet

Arguments:

`Token`	- is an ACE token
`TokenPair`	- is a list of 2 tokens, the 2nd of which is one of {-thing, -body, -one}

   73pronoun_split(Token, TokenPair) :-
   74	pronoun_split(Token, _, TokenPair).
   75
   76pronoun_split(everything, lower, (every, '-thing')).
   77pronoun_split('Everything', upper, (every, '-thing')).
   78pronoun_split(nothing, lower, (no, '-thing')).
   79pronoun_split('Nothing', upper, (no, '-thing')).
   80pronoun_split(something, lower, (a, '-thing')).
   81pronoun_split('Something', upper, (a, '-thing')).
   82
   83pronoun_split(everybody, lower, (every, '-body')).
   84pronoun_split('Everybody', upper, (every, '-body')).
   85pronoun_split(nobody, lower, (no, '-body')).
   86pronoun_split('Nobody', upper, (no, '-body')).
   87pronoun_split(somebody, lower, (a, '-body')).
   88pronoun_split('Somebody', upper, (a, '-body')).
   89
   90pronoun_split(everyone, lower, (every, '-one')).
   91pronoun_split('Everyone', upper, (every, '-one')).
   92pronoun_split(noone, lower, (no, '-one')).
   93pronoun_split('Noone', upper, (no, '-one')).
   94pronoun_split(someone, lower, (a, '-one')).
   95pronoun_split('Someone', upper, (a, '-one')).

atom_capitalize(+Atom:atom, -CapitalizedAtom:atom) is det: Simple predicate to capitalize those ACE words which can occur in the beginning of the sentence.
TODO: every preposition can also start a sentence

  105atom_capitalize(a, 'A') :- !.
  106atom_capitalize(the, 'The') :- !.
  107atom_capitalize(somebody, 'Somebody') :- !.
  108atom_capitalize(something, 'Something') :- !.
  109atom_capitalize(at, 'At') :- !.
  110atom_capitalize(less, 'Less') :- !.
  111atom_capitalize(more, 'More') :- !.
  112atom_capitalize(exactly, 'Exactly') :- !.
  113atom_capitalize(some, 'Some') :- !.
  114atom_capitalize(an, 'An') :- !.
  115
  116atom_capitalize(there, 'There') :- !.
  117atom_capitalize(if, 'If') :- !.
  118atom_capitalize(it, 'It') :- !.
  119atom_capitalize(is, 'Is') :- !.
  120atom_capitalize(are, 'Are') :- !.
  121atom_capitalize(do, 'Do') :- !.
  122atom_capitalize(does, 'Does') :- !.
  123atom_capitalize(for, 'For') :- !.
  124atom_capitalize(not, 'Not') :- !.
  125
  126atom_capitalize(each, 'Each') :- !.
  127atom_capitalize(every, 'Every') :- !.
  128atom_capitalize(everything, 'Everything') :- !.
  129atom_capitalize(everybody, 'Everybody') :- !.
  130atom_capitalize(no, 'No') :- !.
  131atom_capitalize(nothing, 'Nothing') :- !.
  132atom_capitalize(nobody, 'Nobody') :- !.
  133atom_capitalize(all, 'All') :- !.
  134
  135atom_capitalize(who, 'Who') :- !.
  136atom_capitalize(whose, 'Whose') :- !.
  137atom_capitalize(what, 'What') :- !.
  138atom_capitalize(which, 'Which') :- !.
  139atom_capitalize(where, 'Where') :- !.
  140atom_capitalize(when, 'When') :- !.
  141atom_capitalize(how, 'How') :- !.
  142
  143atom_capitalize(can, 'Can') :- !.
  144atom_capitalize(must, 'Must') :- !.
  145atom_capitalize(should, 'Should') :- !.
  146atom_capitalize(may, 'May') :- !.
  147
  148atom_capitalize(he, 'He') :- !.
  149atom_capitalize(his, 'His') :- !.
  150atom_capitalize(she, 'She') :- !.
  151atom_capitalize(her, 'Her') :- !.
  152atom_capitalize(they, 'They') :- !.
  153atom_capitalize(their, 'Their') :- !.
  154atom_capitalize(its, 'Its') :- !.
  155
  156atom_capitalize(Token, Token).

ace_niceace(+TokenListIn:list, -TokenListOut:list) is det

Arguments:

`TokenListIn`	- is a list of ACE tokens
`TokenListOut`	- is a list of ACE tokens

To be done

- Some of these transformations (e.g. a -> an) should be optional.

  167% Strip the sentence start marker (^) if present.
  168ace_niceace([^ | In], Out) :-
  169	!,
  170	ace_niceace_x(In, Out).
  171
  172ace_niceace(In, Out) :-
  173	ace_niceace_x(In, Out).
  174
  175
  176ace_niceace_x([], []) :-
  177	!.
  178
  179ace_niceace_x(In, Out) :-
  180	ace_merge(In, Prefix, Rest),
  181	simple_append(Prefix, RestOut, Out),
  182	ace_niceace_x(Rest, RestOut).

ace_merge(+TokenList:list, -Prefix:list, -NewTokenList:list) is nondet

Arguments:

`TokenList`	- is a list of ACE tokens
`Prefix`	- is a list of ACE tokens
`NewTokenList`	- is a list of ACE tokens

  191ace_merge([Tok1, Tok2 | Rest], [Tok1Tok2], Rest) :-
  192	pronoun_split(Tok1Tok2, (Tok1, Tok2)),
  193	!.
  194
  195ace_merge([a, Prefix, ':', Token | Rest], [Article], [Prefix, ':', Token | Rest]) :-
  196    member(Prefix, [n, a, unknowncat]),
  197	!,
  198	word_article(Token, Article).
  199
  200ace_merge([a, Token | Rest], [Article], [Token | Rest]) :-
  201	Token \= ':',
  202	!,
  203	word_article(Token, Article).
  204
  205ace_merge([Token, SentenceEndSym | Rest], [TokenPeriod], Rest) :-
  206	is_sentence_end_symbol(SentenceEndSym),
  207	!,
  208	concat_atom([Token, SentenceEndSym], TokenPeriod).
  209
  210ace_merge([Token, ',' | Rest], [TokenComma], Rest) :-
  211	!,
  212	concat_atom([Token, ','], TokenComma).
  213
  214ace_merge([Prefix, ':', Token | Rest], [PrefixToken], Rest) :-
  215	member(Prefix, [n, v, p, a]),
  216	!,
  217	concat_atom([Prefix, ':', Token], PrefixToken).
  218
  219ace_merge([unknowncat, ':', Token | Rest], [Token], Rest) :-
  220	!.
  221
  222ace_merge([Token | Rest], [Token], Rest).

simple_append(?List1:list, ?List2:list, ?List3:list) is nondet

Arguments:

`List1`	- is an empty list or a list of one element
`List2`	- is a list
`List3`	- is a list This is a special case of append/2

  233simple_append([], List, List).
  234simple_append([X], List, [X | List]).

word_article(+Word:atom, -Article:atom) is det

This code decides on the article (of the noun phrase) on the basis of a word (either adjective or noun).

Arguments:

`Word`	- is an ACE token
`Article`	- is an ACE indefinite article, one of {a, an}

  247word_article(Word, an) :-
  248	downcase_atom(Word, DowncaseWord),
  249	atom_chars(DowncaseWord, WordChars),
  250	good_an_letters(WordChars),
  251	\+ bad_an_letters(WordChars),
  252	!.
  253
  254word_article(_, a).

good_an_letters(?LetterList:list) is nondet

Arguments:

LetterList

- is a list of letters that a word following 'an' can consist of

  261good_an_letters([a | _]).
  262good_an_letters([e | _]).
  263good_an_letters([i | _]).
  264good_an_letters([o | _]).
  265good_an_letters([u | _]).
  266good_an_letters([h, o, n, o, r, a, b, l, e | _]).
  267good_an_letters([h, e, i, r | _]).
  268good_an_letters([h, o, u, r | _]).
  269
  270good_an_letters([f]).
  271good_an_letters([h]).
  272good_an_letters([l]).
  273good_an_letters([m]).
  274good_an_letters([n]).
  275good_an_letters([r]).
  276good_an_letters([s]).
  277good_an_letters([x]).
  278
  279good_an_letters([f, '-' | _]).
  280good_an_letters([h, '-' | _]).
  281good_an_letters([l, '-' | _]).
  282good_an_letters([m, '-' | _]).
  283good_an_letters([n, '-' | _]).
  284good_an_letters([r, '-' | _]).
  285good_an_letters([s, '-' | _]).
  286good_an_letters([x, '-' | _]).

bad_an_letters(?LetterList:list) is nondet

Arguments:

LetterList

- is a list of letters that a word following 'an' cannot consist of

  293bad_an_letters([u]).
  294bad_an_letters([u, '-' | _]).
  295bad_an_letters([u, r, i | _]).
  296bad_an_letters([u, t, i | _]).
  297bad_an_letters([u, n, i | _]).
  298bad_an_letters([u, s, a | _]).
  299bad_an_letters([u, s, e | _]).
  300%bad_an_letters([u, k, '-' | _]).
  301bad_an_letters([u, k | _]).
  302bad_an_letters([o, n, e | _])