1% This file is part of the Attempto Parsing Engine (APE). 2% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch). 3% 4% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it 5% under the terms of the GNU Lesser General Public License as published by the Free Software 6% Foundation, either version 3 of the License, or (at your option) any later version. 7% 8% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT 9% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 10% PURPOSE. See the GNU Lesser General Public License for more details. 11% 12% You should have received a copy of the GNU Lesser General Public License along with the Attempto 13% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/. 14 15 16:- module(ace_niceace, [ 17 tokens_to_sentences/2, 18 atom_capitalize/2, 19 pronoun_split/2, 20 pronoun_split/3, 21 ace_niceace/2, 22 word_article/2 23 ]). 24 25 26:- use_module('../lexicon/chars', [ 27 is_sentence_end_symbol/1 28 ]).
%% tokens_to_sentences(+TokenLists:list, -Sentences:list) is semidet.
%
% Converts a list of token lists into a list of atoms, one atom per
% token list. Each token list is beautified with ace_niceace/2, its
% first token is capitalized with atom_capitalize/2, and the resulting
% tokens are joined with single spaces.
%
% An empty token list is mapped to the atom 'ERROR'.
%
% @param TokenLists is a list of token lists (one per sentence)
% @param Sentences is the list of corresponding sentence atoms

tokens_to_sentences([], []).

tokens_to_sentences([Tokens | MoreTokens], [Sentence | MoreSentences]) :-
    tokens_to_sentence_(Tokens, Sentence),
    tokens_to_sentences(MoreTokens, MoreSentences).


%% tokens_to_sentence_(+Tokens:list, -Sentence:atom) is semidet.
%
% Helper for tokens_to_sentences/2: converts a single token list.

% An empty token list cannot be rendered; it is reported as 'ERROR'.
tokens_to_sentence_([], Sentence) :-
    !,
    Sentence = 'ERROR'.

tokens_to_sentence_(Tokens, Sentence) :-
    ace_niceace(Tokens, [First | Others]),
    atom_capitalize(First, FirstCapitalized),
    concat_atom([FirstCapitalized | Others], ' ', Sentence).
%% pronoun_split(?Pronoun:atom, ?Parts:term) is nondet.
%% pronoun_split(?Pronoun:atom, ?Case:atom, ?Parts:term) is nondet.
%
% Relates an indefinite pronoun to its two parts: a determiner
% (every, no, a) and a suffix marker ('-thing', '-body', '-one').
% The parts are given as a pair of the form (Determiner, Suffix).
%
% Case is `lower` for the all-lowercase spelling of the pronoun and
% `upper` for the spelling with a capital first letter.
%
% For every pair of parts the lowercase spelling is listed first, so
% it is the first solution when only the parts are known.

pronoun_split(Pronoun, Parts) :-
    pronoun_split(Pronoun, _Case, Parts).

% -thing
pronoun_split(everything,   lower, (every, '-thing')).
pronoun_split('Everything', upper, (every, '-thing')).
pronoun_split(nothing,      lower, (no,    '-thing')).
pronoun_split('Nothing',    upper, (no,    '-thing')).
pronoun_split(something,    lower, (a,     '-thing')).
pronoun_split('Something',  upper, (a,     '-thing')).

% -body
pronoun_split(everybody,    lower, (every, '-body')).
pronoun_split('Everybody',  upper, (every, '-body')).
pronoun_split(nobody,       lower, (no,    '-body')).
pronoun_split('Nobody',     upper, (no,    '-body')).
pronoun_split(somebody,     lower, (a,     '-body')).
pronoun_split('Somebody',   upper, (a,     '-body')).

% -one
pronoun_split(everyone,     lower, (every, '-one')).
pronoun_split('Everyone',   upper, (every, '-one')).
pronoun_split(noone,        lower, (no,    '-one')).
pronoun_split('Noone',      upper, (no,    '-one')).
pronoun_split(someone,      lower, (a,     '-one')).
pronoun_split('Someone',    upper, (a,     '-one')).
% TODO: every preposition can also start a sentence.
%% atom_capitalize(+Token:atom, -CapitalizedToken:atom) is det.
%
% Capitalizes a token that stands at the beginning of a sentence.
% Only the function words listed below are capitalized; every other
% token is returned unchanged by the final catch-all clause.
%
% The indefinite pronouns ending in -one (someone, everyone, noone) are
% included so that this table agrees with pronoun_split/3, which knows
% both the lowercase and the capitalized spelling of these pronouns.
%
% @param Token is the first token of a sentence
% @param CapitalizedToken is the token as it should appear sentence-initially

% Determiners, quantifier words and indefinite pronouns
atom_capitalize(a, 'A') :- !.
atom_capitalize(the, 'The') :- !.
atom_capitalize(somebody, 'Somebody') :- !.
atom_capitalize(something, 'Something') :- !.
atom_capitalize(someone, 'Someone') :- !.
atom_capitalize(at, 'At') :- !.
atom_capitalize(less, 'Less') :- !.
atom_capitalize(more, 'More') :- !.
atom_capitalize(exactly, 'Exactly') :- !.
atom_capitalize(some, 'Some') :- !.
atom_capitalize(an, 'An') :- !.

% Sentence-initial function words
atom_capitalize(there, 'There') :- !.
atom_capitalize(if, 'If') :- !.
atom_capitalize(it, 'It') :- !.
atom_capitalize(is, 'Is') :- !.
atom_capitalize(are, 'Are') :- !.
atom_capitalize(do, 'Do') :- !.
atom_capitalize(does, 'Does') :- !.
atom_capitalize(for, 'For') :- !.
atom_capitalize(not, 'Not') :- !.

% Universal and negative quantifiers
atom_capitalize(each, 'Each') :- !.
atom_capitalize(every, 'Every') :- !.
atom_capitalize(everything, 'Everything') :- !.
atom_capitalize(everybody, 'Everybody') :- !.
atom_capitalize(everyone, 'Everyone') :- !.
atom_capitalize(no, 'No') :- !.
atom_capitalize(nothing, 'Nothing') :- !.
atom_capitalize(nobody, 'Nobody') :- !.
atom_capitalize(noone, 'Noone') :- !.
atom_capitalize(all, 'All') :- !.

% Query words
atom_capitalize(who, 'Who') :- !.
atom_capitalize(whose, 'Whose') :- !.
atom_capitalize(what, 'What') :- !.
atom_capitalize(which, 'Which') :- !.
atom_capitalize(where, 'Where') :- !.
atom_capitalize(when, 'When') :- !.
atom_capitalize(how, 'How') :- !.

% Modal auxiliaries
atom_capitalize(can, 'Can') :- !.
atom_capitalize(must, 'Must') :- !.
atom_capitalize(should, 'Should') :- !.
atom_capitalize(may, 'May') :- !.

% Personal and possessive pronouns
atom_capitalize(he, 'He') :- !.
atom_capitalize(his, 'His') :- !.
atom_capitalize(she, 'She') :- !.
atom_capitalize(her, 'Her') :- !.
atom_capitalize(they, 'They') :- !.
atom_capitalize(their, 'Their') :- !.
atom_capitalize(its, 'Its') :- !.

% Any other token is left as it is.
atom_capitalize(Token, Token).
%% ace_niceace(+TokenList:list, -NiceTokenList:list) is semidet.
%
% Beautifies a list of ACE tokens. A leading sentence start marker (^)
% is dropped if present; the remaining tokens are rewritten by
% ace_niceace_x/2.
%
% @param TokenList is a list of ACE tokens
% @param NiceTokenList is the beautified list of tokens

ace_niceace(TokenList, NiceTokenList) :-
    (   TokenList = ['^' | TokensAfterMarker]
    ->  ace_niceace_x(TokensAfterMarker, NiceTokenList)
    ;   ace_niceace_x(TokenList, NiceTokenList)
    ).


%% ace_niceace_x(+TokenList:list, -NiceTokenList:list) is semidet.
%
% Repeatedly lets ace_merge/3 consume tokens from the front of the list
% and collects the tokens it produces, until the input is exhausted.

ace_niceace_x([], []) :-
    !.

ace_niceace_x(TokenList, NiceTokenList) :-
    ace_merge(TokenList, Merged, Remaining),
    simple_append(Merged, NiceRemaining, NiceTokenList),
    ace_niceace_x(Remaining, NiceRemaining).
%% ace_merge(+TokenList:list, -Merged:list, -Remaining:list) is semidet.
%
% Consumes one or more tokens from the front of TokenList and returns
% the single output token they are rewritten to (as a one-element list)
% together with the tokens that still have to be processed.
% The clauses are tried in order and the first applicable one is used.
%
% @param TokenList is the list of tokens still to be processed
% @param Merged is a one-element list holding the produced token
% @param Remaining is the rest of the token list

% A determiner followed by a suffix marker is joined back into a
% pronoun, e.g. every + '-thing' gives everything.
ace_merge([Determiner, Suffix | Remaining], [Pronoun], Remaining) :-
    pronoun_split(Pronoun, (Determiner, Suffix)),
    !.

% The article `a` in front of a prefixed word (n:, a:, unknowncat:):
% the article is chosen by the word behind the prefix; the prefixed
% word itself stays in the input.
ace_merge([a, Prefix, ':', Word | Tail], [Article], [Prefix, ':', Word | Tail]) :-
    member(Prefix, [n, a, unknowncat]),
    !,
    word_article(Word, Article).

% The article `a` in front of any other token (but not in front of a
% colon, where `a` is a prefix rather than an article).
ace_merge([a, Word | Tail], [Article], [Word | Tail]) :-
    Word \= ':',
    !,
    word_article(Word, Article).

% A sentence end symbol is glued to the preceding token.
ace_merge([Word, EndSymbol | Remaining], [WordWithEndSymbol], Remaining) :-
    is_sentence_end_symbol(EndSymbol),
    !,
    concat_atom([Word, EndSymbol], WordWithEndSymbol).

% A comma is glued to the preceding token.
ace_merge([Word, ',' | Remaining], [WordWithComma], Remaining) :-
    !,
    concat_atom([Word, ','], WordWithComma).

% A word class prefix (n, v, p, a), the colon and the word are joined
% into one token, e.g. n + ':' + man gives 'n:man'.
ace_merge([Prefix, ':', Word | Remaining], [PrefixedWord], Remaining) :-
    member(Prefix, [n, v, p, a]),
    !,
    concat_atom([Prefix, ':', Word], PrefixedWord).

% The prefix `unknowncat` is dropped; only the word itself is kept.
ace_merge([unknowncat, ':', Word | Remaining], [Word], Remaining) :-
    !.

% Any other token is passed through unchanged.
ace_merge([Word | Remaining], [Word], Remaining).
%% simple_append(?Prefix:list, ?List:list, ?Joined:list) is semidet.
%
% A restricted form of append/3 that only supports a prefix of
% length 0 or 1. It fails for any longer prefix.
%
% @param Prefix is either the empty list or a one-element list
% @param List is the list the prefix is put in front of
% @param Joined is List with the element of Prefix (if any) in front

simple_append([], List, List).

simple_append([Element], List, Joined) :-
    Joined = [Element | List].
% See also: http://en.wikipedia.org/wiki/A_and_an
%% word_article(+Word:atom, ?Article:atom) is semidet.
%
% Selects the indefinite article (`a` or `an`) that goes in front of
% the given word. The word is converted to lowercase and split into
% characters; the article is `an` if the characters match
% good_an_letters/1 and do not match bad_an_letters/1, and `a`
% otherwise.
%
% The decision is made before Article is unified, so the predicate
% also gives the right answer when it is called with a bound Article:
% word_article(apple, a) fails instead of succeeding through a
% catch-all clause.
%
% @param Word is the word that follows the article
% @param Article is one of {a, an}

word_article(Word, Article) :-
    downcase_atom(Word, LowercaseWord),
    atom_chars(LowercaseWord, Chars),
    (   good_an_letters(Chars),
        \+ bad_an_letters(Chars)
    ->  Article = an
    ;   Article = a
    ).
%% good_an_letters(?Chars:list) is nondet.
%
% Succeeds if Chars (the lowercase characters of a word) begin in a way
% that usually calls for the article `an`. Exceptions to these patterns
% are listed in bad_an_letters/1.

% Words that start with a vowel letter.
good_an_letters([a | _]).
good_an_letters([e | _]).
good_an_letters([i | _]).
good_an_letters([o | _]).
good_an_letters([u | _]).

% Words that start with a silent `h`:
% honor, honorable, honorary, ..., honour, honourable, ...,
% honest, honesty, ..., heir, heiress, ..., hour, hourly, ...
good_an_letters([h, o, n, o, r | _]).
good_an_letters([h, o, n, o, u, r | _]).
good_an_letters([h, o, n, e, s, t | _]).
good_an_letters([h, e, i, r | _]).
good_an_letters([h, o, u, r | _]).

% Single consonant letters whose names start with a vowel sound
% (e.g. "an f", "an x").
good_an_letters([f]).
good_an_letters([h]).
good_an_letters([l]).
good_an_letters([m]).
good_an_letters([n]).
good_an_letters([r]).
good_an_letters([s]).
good_an_letters([x]).

% The same letters followed by a hyphen (e.g. "an x-ray").
good_an_letters([f, '-' | _]).
good_an_letters([h, '-' | _]).
good_an_letters([l, '-' | _]).
good_an_letters([m, '-' | _]).
good_an_letters([n, '-' | _]).
good_an_letters([r, '-' | _]).
good_an_letters([s, '-' | _]).
good_an_letters([x, '-' | _]).
%% bad_an_letters(?Chars:list) is nondet.
%
% Succeeds if Chars (the lowercase characters of a word) begin in a way
% that calls for the article `a` although the word starts with a vowel
% letter. These are the exceptions to good_an_letters/1.

% The letter `u` on its own or followed by a hyphen (e.g. "a u-turn").
bad_an_letters([u]).
bad_an_letters([u, '-' | _]).

% Words starting with these `u...` letter sequences
% (e.g. "a unit", "a user").
bad_an_letters([u, r, i | _]).
bad_an_letters([u, t, i | _]).
bad_an_letters([u, n, i | _]).
bad_an_letters([u, s, a | _]).
bad_an_letters([u, s, e | _]).
% The hyphenated form is covered by the more general `uk` clause below.
%bad_an_letters([u, k, '-' | _]).
bad_an_letters([u, k | _]).

% Words starting with `one` (e.g. "a one-way street").
bad_an_letters([o, n, e | _]).
ACE beautifier
This code does the following: