% This file is part of the Attempto Parsing Engine (APE).
% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
%
% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it
% under the terms of the GNU Lesser General Public License as published by the Free Software
% Foundation, either version 3 of the License, or (at your option) any later version.
%
% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT
% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
% PURPOSE. See the GNU Lesser General Public License for more details.
%
% You should have received a copy of the GNU Lesser General Public License along with the Attempto
% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/.


:- module(ape_utils, [
        cpu_time/2,
        handle_unknown_words/4,
        new_npid/1
    ]).

/** APE utils

@author Kaarel Kaljurand
@author Tobias Kuhn
@version 2009-11-18

*/

:- use_module(grammar_words).
:- use_module('../lexicon/spellcheck').
:- use_module('../lexicon/is_in_lexicon').
:- use_module('../lexicon/functionwords').
:- use_module('../lexicon/chars', [is_capitalized/1]).

:- use_module('../logger/error_logger', [
        add_error_messagelist/4,
        add_error_message/4
    ]).

:- use_module(sentence_failure, [
        get_all_sentence_errors/2
    ]).

:- use_module('../utils/ace_niceace', [
        ace_niceace/2
    ]).

:- op(400, fy, -).
:- op(400, fy, ~).
:- op(500, xfx, =>).
:- op(500, xfx, v).

%:- debug(known_word).


:- export(trace_failure/1).
:- meta_predicate(trace_failure(0)).

%% trace_failure(:Goal)
%
% Runs Goal and passes its solutions through (soft-cut). If Goal has no
% solution at all, calls nop(trace) and then runs Goal a second time.
%
% NOTE(review): nop/1 is neither defined nor imported in this file, so it
% must be supplied by the environment. Also note that a failing Goal is
% executed twice, so any side effects of Goal are repeated.
%
trace_failure(Goal) :-
    (   Goal
    *-> true
    ;   nop(trace),
        Goal
    ).


% Goal is called by cpu_time/2, so it must be resolved in the module of the
% caller and not in this module.
:- meta_predicate(cpu_time(0, -)).

%% cpu_time(:Goal, -Duration:float) is det.
%
% Runs Goal at most to its first solution and always succeeds, even if Goal
% fails (ignore/1).
%
% @param Goal is the goal to be timed; it is run through trace_failure/1,
%        i.e. a failing Goal is run twice and both runs are timed
% @param Duration is the difference of the two `runtime` statistics readings
%        scaled by 0.001 (the `runtime` key is reported in milliseconds,
%        so Duration is in seconds)
%
cpu_time(Goal, Duration) :-
    statistics(runtime, [Start | _]),
    ignore(trace_failure(Goal)),
    statistics(runtime, [Finish | _]),
    Duration is (Finish - Start) * 0.001.


% @deprecated
%
% Reads character codes from the current input up to (but not including)
% the first newline or end-of-file. The terminating code is consumed.
read_text_from_commandline([C | R]) :-
    get_code(C),            % ISO replacement for the Edinburgh get0/1
    line_continues(C),
    !,
    read_text_from_commandline(R).

read_text_from_commandline([]).
% @deprecated
%
% Succeeds iff Code is neither the newline code (10) nor the
% end-of-file code (-1).
line_continues(Code) :-
    Code \== 0'\n,
    Code \== -1.


%% list_close(+OpenList, -ClosedList) is det.
%
% Closes an open list: an unbound tail is bound to [] and the
% elements are copied into ClosedList.

list_close([], []) :- !.

list_close([Item | Rest], [Item | ClosedRest]) :-
    list_close(Rest, ClosedRest).


%% member_of_domain(+Element:var, +List:list) is det.
%
% member/2 for variables: the comparison is done with ==/2, i.e.
% nothing gets unified.
%
% @bug we should not catch wellformedness bugs here, but in the beginning.
% (Dom must always be a closed list)
%
% Note: Can Dom be open during parsing? (member_of_domain/2 is called
% from add_modifiers)

member_of_domain(Element, [First | Rest]) :-
    (   Element == First
    ->  true
    ;   nonvar(Rest),       % catches a bug: never walk into an open tail
        member_of_domain(Element, Rest)
    ).


%% list_of_conds_and_anaphors(+List, -List, -AnaphorAnteList)
%
% Removes the `antecedent' and `anaphor' terms from a (possibly open)
% condition list and returns the remaining conditions as a closed list.
% The removed `antecedent' and `anaphor' terms are collected, in their
% original order, in the third argument. The conditions stored in an
% `anaphor' term are merged into the result (which is then sorted).
%
% Embedded DRSs are processed recursively, which is needed for the
% following tests to work.
%
% - There is a man who sees no dog. The man who sees no dog waits.
% - There is a man who sees every dog. The man who sees every dog waits.
% - There is a man who sees a dog or who sees a cat. The man who sees a dog or who sees a cat waits.
% - @bug also: can, must, can't, does not have to
%
% @bug Note that we ignore the inner anaphors for the time being.
% Some theoretical work is needed to find out if they are needed.
%
% @bug Type-argument is currently ignored.

% Open tail: close the list.
list_of_conds_and_anaphors(OpenTail, [], []) :-
    var(OpenTail),
    !.

list_of_conds_and_anaphors([], [], []).

% antecedent/N: not a condition, only recorded.
list_of_conds_and_anaphors([Antecedent | Rest], Conds, [Antecedent | AnaAnte]) :-
    functor(Antecedent, antecedent, _),
    !,
    list_of_conds_and_anaphors(Rest, Conds, AnaAnte).

% anaphor/11: recorded, and its conditions are added to the result.
list_of_conds_and_anaphors([Anaphor | Rest], Conds, [Anaphor | AnaAnte]) :-
    Anaphor = anaphor(_, _, _, AnaphorConds, _, _, _, _, _, _, _),
    !,
    list_of_conds_and_anaphors(Rest, RestConds, AnaAnte),
    append(AnaphorConds, RestConds, UnsortedConds),
    sort(UnsortedConds, Conds).

% Complex condition (contains embedded DRSs).
list_of_conds_and_anaphors([Complex | Rest], [ComplexPruned | RestPruned], AnaAnte) :-
    handle_condition(Complex, ComplexPruned),
    !,
    list_of_conds_and_anaphors(Rest, RestPruned, AnaAnte).

% Simple condition.
list_of_conds_and_anaphors([Simple | Rest], [Simple | RestConds], AnaAnte) :-
    list_of_conds_and_anaphors(Rest, RestConds, AnaAnte).


%% handle_condition(+Condition, -ConditionPruned)
%
% Prunes the DRSs embedded in a complex condition. Fails if Condition
% is not one of the complex forms listed below. The anaphors and
% antecedents found inside the embedded DRSs are dropped.
%
handle_condition([First | Rest], CondsPruned) :-
    list_of_conds_and_anaphors([First | Rest], CondsPruned, _).

handle_condition(-Drs, -DrsPruned) :-
    prune_embedded_drs(Drs, DrsPruned).

handle_condition(Label:Drs, Label:DrsPruned) :-
    prune_embedded_drs(Drs, DrsPruned).

handle_condition(can(Drs), can(DrsPruned)) :-
    prune_embedded_drs(Drs, DrsPruned).

handle_condition(must(Drs), must(DrsPruned)) :-
    prune_embedded_drs(Drs, DrsPruned).

handle_condition(should(Drs), should(DrsPruned)) :-
    prune_embedded_drs(Drs, DrsPruned).

handle_condition(may(Drs), may(DrsPruned)) :-
    prune_embedded_drs(Drs, DrsPruned).

handle_condition(~Drs, ~DrsPruned) :-
    prune_embedded_drs(Drs, DrsPruned).

handle_condition(Drs1 => Drs2, Drs1Pruned => Drs2Pruned) :-
    prune_embedded_drs(Drs1, Drs1Pruned),
    prune_embedded_drs(Drs2, Drs2Pruned).

handle_condition(Drs1 v Drs2, Drs1Pruned v Drs2Pruned) :-
    prune_embedded_drs(Drs1, Drs1Pruned),
    prune_embedded_drs(Drs2, Drs2Pruned).


% prune_embedded_drs(+Drs, -DrsPruned)
%
% Private helper of handle_condition/2: keeps the domain of a drs/2 term
% and prunes its condition list. Fails if the argument is not a drs/2 term.
prune_embedded_drs(drs(Dom, Conds), drs(Dom, CondsPruned)) :-
    list_of_conds_and_anaphors(Conds, CondsPruned, _).


%% report_failed_sentence(+SentenceID:number, +FirstUnparsedSentence:list) is det.
%
% Reports a sentence that could not be parsed: the position where parsing
% stopped is marked with '<>' and the sentence is logged, together with the
% error texts delivered by get_all_sentence_errors/2, via
% add_error_messagelist/4.
%
% @param SentenceID is a sentence ID
% @param FirstUnparsedSentence is the token list of the sentence; its last
%        N tokens are treated as unparsed, where N is the number delivered
%        by get_unparsed_tokens_number/1
%
% @bug We also return the token ID that identifies the first unparsed token.
% This ID is misleading though as it references a token in the internal token
% list which contains things like '^', '-thing', 'n', ':', etc. Thus the number can
% point to a location further than the actual location perceived by the user.
% (We do subtract 1 to account for the '^' symbol present in every token list, i.e. otherwise
% the formula would be: TokenID is (AllTokensNumber - UnparsedTokensNumber) + 1).
%
report_failed_sentence(SentenceID, FirstSentence) :-
    get_unparsed_tokens_number(UnparsedTokensNumber),
    % Split the sentence so that UnparsedTokens are its last UnparsedTokensNumber tokens.
    length(UnparsedTokens, UnparsedTokensNumber),
    append(ParsedTokens, UnparsedTokens, FirstSentence),
    % Insert the failure marker between the parsed and the unparsed part.
    append(ParsedTokens, ['<>' | UnparsedTokens], FirstSentenceX),
    get_all_sentence_errors(FirstSentenceX, ErrorTextList),
    ace_niceace(FirstSentenceX, FirstSentenceNice),
    % atomic_list_concat/3 replaces the deprecated concat_atom/3.
    atomic_list_concat(FirstSentenceNice, ' ', FirstSentenceAtom),
    length(FirstSentence, AllTokensNumber),
    TokenID is (AllTokensNumber - UnparsedTokensNumber),
    add_error_messagelist(sentence, SentenceID-TokenID, FirstSentenceAtom, ErrorTextList).


%% spellcheck_and_report_unknown_words(+UnknownTokenList, +N)
%
% Runs the spellchecker on the unknown tokens and logs one error message
% per token: either the spelling candidates, or a hint to use a prefix.
% Fails if spellcheck:damerau_rules/3 fails.
%
% @param UnknownTokenList is a list of unknown tokens.
% @param N is used as the first component of the error location N-''
%        (handle_unknown_words/4 passes its sentence counter here)
%
spellcheck_and_report_unknown_words(UnknownTokenList, N) :-
    spellcheck:damerau_rules(UnknownTokenList, Candidates, NoSolutions),
    !,
    report_candidates(Candidates, N),
    report_nosolutions(NoSolutions, N).


% report_candidates(+Candidates, +N)
%
% Logs a word error for every Bad-Good pair of the list.
report_candidates([], _).

report_candidates([Bad-Good | T], N) :-
    add_error_message(word, N-'', Bad, Good),
    report_candidates(T, N).


% report_nosolutions(+NoSolutions, +N)
%
% Logs a word error (with a hint to use a prefix) for every token of the list.
report_nosolutions([], _).

report_nosolutions([Bad | T], N) :-
    add_error_message(word, N-'', Bad, 'Use the prefix n:, v:, a: or p:.'),
    report_nosolutions(T, N).


%% handle_unknown_words(+GuessOnOff:atom, +Sentences:list, -SentencesOut:list, +N:integer) is det.
%
% @param GuessOnOff if `on' then the unknown words of every sentence get
%        a prefix (see prefix_unknown_words/2) and nothing is reported;
%        otherwise the sentences are returned unchanged and their unknown
%        words are spellchecked and reported
% @param Sentences is a list of sentences (token lists)
% @param SentencesOut is the resulting list of sentences
% @param N is the number used when reporting the unknown words of the first
%        sentence; it is incremented by 1 for every following sentence
%        (ignored if GuessOnOff is `on')
%
handle_unknown_words(_, [], [], _).

handle_unknown_words(on, [SentenceIn|RestIn], [SentenceOut|RestOut], _) :-
    !,
    prefix_unknown_words(SentenceIn, SentenceOut),
    handle_unknown_words(on, RestIn, RestOut, _).

handle_unknown_words(G, [Sentence|Rest], [Sentence|Rest], N) :-
    get_unknown_words(Sentence, UnknownTokens),
    spellcheck_and_report_unknown_words(UnknownTokens, N),
    NextN is N + 1,
    handle_unknown_words(G, Rest, Rest, NextN).


%% get_unknown_words(+TokenList, -UnknownTokenList)
%
% Collects the tokens that are neither known (in their context) nor
% capitalized. Every unknown token is returned only once, at the position
% of its first occurrence, so that it is not reported several times.
%
get_unknown_words(TokenList, UnknownTokenList) :-
    get_unknown_words(TokenList, [x, x], UnknownWithRepeats),
    % Order-preserving removal of repeated unknown tokens.
    list_to_set(UnknownWithRepeats, UnknownTokenList).

% get_unknown_words(+TokenList, +LeftContext, -UnknownTokenList)
%
% LeftContext holds the two tokens to the left of the current token
% ('x' is used as a filler at the beginning of the sentence).
get_unknown_words([], _, []).

get_unknown_words([Token | TokensTail], [Left2, Left1], UnknownTokenList) :-
    get_right_context(TokensTail, Right1),
    (
        known_or_capitalized(Left2, Left1, Token, Right1)
    ->
        UnknownTokenList = UnknownTokenListTail
    ;
        UnknownTokenList = [Token | UnknownTokenListTail]
    ),
    get_unknown_words(TokensTail, [Left1, Token], UnknownTokenListTail).


%% known_or_capitalized(+Left2:atom, +Left1:atom, +WordForm:atom, +Right1:atom) is det.
%
% Succeeds if WordForm is capitalized, or if it is a known word given the
% two tokens to its left and the one token to its right.
%
known_or_capitalized(_, _, WordForm, _) :-
    chars:is_capitalized(WordForm),
    !.

known_or_capitalized(Left2, Left1, WordForm, Right1) :-
    debug(known_word, "~w ~w *~w* ~w~n", [Left2, Left1, WordForm, Right1]),
    known_word(Left2, Left1, WordForm, Right1),
    debug(known_word, "ok~n", []).


%% prefix_unknown_words(+TokenList, -TokenListWithPrefixes)
%
% Add variable prefixes to unknown words. The prefixes are grounded
% by the first successful parse. This essentially amounts to guessing
% the word class (noun, verb, adjective, adverb) of unknown words.
%
% Note that propernames are difficult to guess since they are not introduced
% by a marker (like 'a' or 'every'). Adding a variable prefix makes the variable
% unify also with function words (e.g. 'if'). This causes a performance problem
% and doesn't work anyway.
% So, for the time being, we simply don't guess
% propernames (even though propernames deserve guessing the most, being a very
% open class).

prefix_unknown_words(TokenList, TokenListWithPrefixes) :-
    prefix_unknown_words(TokenList, [x, x], TokenListWithPrefixes).

% prefix_unknown_words(+TokenList, +LeftContext, -TokenListWithPrefixes)
%
% LeftContext holds the two tokens to the left of the current token.
% Every token that is neither known nor capitalized is preceded by the
% two tokens `unknowncat' and `:'.
prefix_unknown_words([], _, []).

prefix_unknown_words([Token | Rest], [Left2, Left1], Prefixed) :-
    get_right_context(Rest, Right1),
    (   known_or_capitalized(Left2, Left1, Token, Right1)
    ->  Prefixed = [Token | PrefixedRest]
    ;   Prefixed = [unknowncat, ':', Token | PrefixedRest]
    ),
    prefix_unknown_words(Rest, [Left1, Token], PrefixedRest).


%% get_right_context(+List, -FirstElement).
%
% Delivers the first element of List, or the filler 'x' if List is empty.
%
% @bug This predicate is experimental.

get_right_context([], x).

get_right_context([First | _], First).


%% known_word(+Left2:atom, +Left1:atom, +Token:atom, +Right1:atom) is det.
%
% Token is "known" iff:
%
% it is in quotation marks;
% it is prefixed;
% it is a prefix symbol;
% it is in the lexicon.
%

% Token starts and ends with a double quote.
known_word(_, _, Quoted, _) :-
    atom_concat('"', AfterOpeningQuote, Quoted),
    atom_concat(_, '"', AfterOpeningQuote).

% Token is preceded by a prefix symbol and a colon.
known_word(Prefix, ':', _, _) :-
    word_class_prefix(Prefix),
    !.

% Token is itself a prefix symbol (it is followed by a colon).
known_word(_, _, Prefix, ':') :-
    word_class_prefix(Prefix),
    !.

known_word(_, _, WordForm, _) :-
    is_in_lexicon:is_in_lexicon(WordForm).


% word_class_prefix(+Prefix)
%
% Private helper of known_word/4: Prefix is accepted by one of the
% prefix predicates of the module functionwords.
word_class_prefix(Prefix) :-
    (   functionwords:propername_prefix(Prefix, _)
    ;   functionwords:noun_prefix(Prefix, _)
    ;   functionwords:verb_prefix(Prefix)
    ;   functionwords:modif_prefix(Prefix)
    ).


%----------------------------------------------------------------------------
% new_npid(-NPID:number) is det.
%
% new_npid delivers the next identifier (ID) for an NP.
% NP IDs are used for anaphora resolution recency calculations.
%
% The counter is kept in the backtrackable global variable `npid', which
% must have been set before the first call.
%----------------------------------------------------------------------------

new_npid(NewNPID) :-
    b_getval(npid, PreviousNPID),
    NewNPID is PreviousNPID + 1,
    b_setval(npid, NewNPID).