1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2017, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(c99_tokens,
   36          [ c99_tokens//1,              % -List
   37            c99_token//1                % -Token
   38          ]).   39:- use_module(library(dcg/basics),
   40              [ blanks//0, string//1, string_without//2, eos//0]).
 c99_tokens(-Tokens)// is semidet
Tokenize an input according to the C99 rules. Tokens are:
   70c99_tokens(List) -->
   71    c99_token(H), !,
   72    (   {skip_token(H)}
   73    ->  c99_tokens(List)
   74    ;   {List = [H|T]},
   75        c99_tokens(T)
   76    ).
   77c99_tokens([]) -->
   78    blanks.
   79
   80skip_token(pp(_)).
   81
   82c99_token(Token) -->
   83    blanks,
   84    token(Token).
 token(-Token)//
A1: recognise a C99 token.
   90token(T) --> keyword(T), \+ identifier_cont_char(_).
   91token(T) --> identifier(T).
   92token(T) --> constant(T).
   93token(T) --> string_literal(T).
   94token(T) --> pp_line(T).
   95token(T) --> punctuator(T).
   96token(T) --> header_name(T).
   97token(T) --> pp_number(T).
   98
   99keyword(auto)         --> "auto".
  100keyword(break)        --> "break".
  101keyword(case)         --> "case".
  102keyword(char)         --> "char".
  103keyword(const)        --> "const".
  104keyword(continue)     --> "continue".
  105keyword(default)      --> "default".
  106keyword(do)           --> "do".
  107keyword(double)       --> "double".
  108keyword(else)         --> "else".
  109keyword(enum)         --> "enum".
  110keyword(extern)       --> "extern".
  111keyword(float)        --> "float".
  112keyword(for)          --> "for".
  113keyword(goto)         --> "goto".
  114keyword(if)           --> "if".
  115keyword(inline)       --> "inline".
  116keyword(int)          --> "int".
  117keyword(size_t)       --> "size_t".             % Clang on Macos
  118keyword(long)         --> "long".
  119keyword(register)     --> "register".
  120keyword(restrict)     --> "restrict".
  121keyword(return)       --> "return".
  122keyword(short)        --> "short".
  123keyword(signed)       --> "signed".
  124keyword(sizeof)       --> "sizeof".
  125keyword(static)       --> "static".
  126keyword(struct)       --> "struct".
  127keyword(switch)       --> "switch".
  128keyword(typedef)      --> "typedef".
  129keyword(union)        --> "union".
  130keyword(unsigned)     --> "unsigned".
  131keyword(void)         --> "void".
  132keyword(volatile)     --> "volatile".
  133keyword(while)        --> "while".
  134keyword('_Bool')      --> "_Bool".
  135keyword('_Complex')   --> "_Complex".
  136keyword('_Imaginary') --> "_Imaginary".
  137keyword('__signed__') --> "__signed__".
  138keyword('__int128_t') --> "__int128_t".
  139keyword('__uint128_t') --> "__uint128_t".
  140keyword('_Float16')   --> "_Float16".		% GCC
  141keyword('_Float32')   --> "_Float32".		% GCC
  142keyword('_Float32x')   --> "_Float32x".		% GCC
  143keyword('_Float64')   --> "_Float64".		% GCC
  144keyword('_Float64x')   --> "_Float64x".		% GCC
  145keyword('_Float80')   --> "_Float80".		% GCC
  146keyword('_Float128')   --> "_Float128".		% GCC
  147keyword('__attribute__') --> "__attribute__".   % GCC
  148keyword('__restrict__') --> "__restrict__".
  149keyword('__restrict__') --> "__restrict".
  150keyword('__extension__') --> "__extension__".
  151keyword(inline) --> "__inline__".
  152keyword(inline) --> "__inline".
  153keyword('__builtin_va_list') --> "__builtin_va_list".
  154keyword('__gnuc_va_list') --> "__gnuc_va_list".
  155keyword('__asm__') --> "__asm__".
  156keyword('__asm__') --> "__asm".
  157keyword('__alignof__') --> "__alignof__".
  158keyword('_Nonnull') --> "_Nonnull".
  159keyword('_Nullable') --> "_Nullable".
  160
  161identifier(Id) --> identifier_nondigit(H), identifier_cont(T),
  162		   {atom_chars(I, [H|T]), Id = id(I)}.
  163
  164identifier_cont([H|T]) -->
  165    identifier_cont_char(H), !,
  166    identifier_cont(T).
  167identifier_cont([]) --> [].
  168
  169identifier_cont_char(H) -->
  170    identifier_nondigit(H), !.
  171identifier_cont_char(H) -->
  172    digit(H).
  173
  174identifier_nondigit(I) --> nondigit(I).
  175identifier_nondigit(I) --> universal_character_name(I).
  176%identifier_nondigit(I) --> other_implementation_defined_characters(I).
  177
  178term_expansion((nondigit(_) --> "_"), Clauses) :-
  179    findall((nondigit(C) --> S), nondigit_code(C, S), Clauses).
  180
  181nondigit_code(Char, [C]) :-
  182    (   C = 0'_ ; between(0'a,0'z,C) ; between(0'A,0'Z,C) ),
  183    char_code(Char, C).
  184
  185nondigit(_) --> "_".
  186
  187digit('0') --> "0".
  188digit('1') --> "1".
  189digit('2') --> "2".
  190digit('3') --> "3".
  191digit('4') --> "4".
  192digit('5') --> "5".
  193digit('6') --> "6".
  194digit('7') --> "7".
  195digit('8') --> "8".
  196digit('9') --> "9".
  197
  198universal_character_name(C) -->
  199    "\\u", hex_quad(V),
  200    { char_code(C, V) }.
  201universal_character_name(C) -->
  202    "\\U", hex_quad(V), hex_quad(W),
  203    { Code is (V<<32) + W, char_code(C, Code) }.
  204
  205hex_quad(V) -->
  206    hexadecimal_digit(X1),
  207    hexadecimal_digit(X2),
  208    hexadecimal_digit(X3),
  209    hexadecimal_digit(X4),
  210    { V is (X1<<12) + (X2<<8) + (X3<<4) + X4 }.
  211
  212
  213constant(C) --> floating_constant(C).
  214constant(C) --> integer_constant(C).
  215constant(C) --> enumeration_constant(C).
  216constant(C) --> character_constant(C).
  217
  218integer_constant(C) -->
  219    decimal_constant(D), opt_integer_suffix(S), {mkic(S,D,C)}.
  220integer_constant(C) -->
  221    hexadecimal_constant(D), opt_integer_suffix(S), {mkic(S,D,C)}.
  222integer_constant(C) -->
  223    octal_constant(D), opt_integer_suffix(S), {mkic(S,D,C)}.
  224
  225mkic(Suffix, Value, Token) :- Token =.. [Suffix,Value].
  226
  227decimal_constant(D) -->
  228    nonzero_digit(D0), digits(DT), { number_chars(D, [D0|DT]) }.
  229
  230octal_constant(D) -->
  231    "0", octal_digits(L), { L == [] -> D = 0 ; number_chars(D, ['0',o|L]) }.
  232
  233digits([H|T]) --> digit(H), !, digits(T).
  234digits([]) --> [].
  235
  236octal_digits([H|T]) --> octal_digit(H), !, octal_digits(T).
  237octal_digits([]) --> [].
  238
  239hexadecimal_constant(D) -->
  240    hexadecimal_prefix,
  241    hexadecimal_digits(0, D).
  242
  243hexadecimal_prefix --> "0x".
  244hexadecimal_prefix --> "0X".
  245
  246hexadecimal_digits(V0, V) -->
  247    hexadecimal_digit(D),
  248    !,
  249    { V1 is V0*16+D },
  250    hexadecimal_digits(V1, V).
  251hexadecimal_digits(V, V) -->
  252    [].
  253
  254nonzero_digit('1') --> "1".
  255nonzero_digit('2') --> "2".
  256nonzero_digit('3') --> "3".
  257nonzero_digit('4') --> "4".
  258nonzero_digit('5') --> "5".
  259nonzero_digit('6') --> "6".
  260nonzero_digit('7') --> "7".
  261nonzero_digit('8') --> "8".
  262nonzero_digit('9') --> "9".
  263
  264octal_digit('0') --> "0".
  265octal_digit('1') --> "1".
  266octal_digit('2') --> "2".
  267octal_digit('3') --> "3".
  268octal_digit('4') --> "4".
  269octal_digit('5') --> "5".
  270octal_digit('6') --> "6".
  271octal_digit('7') --> "7".
  272
  273hexadecimal_digit(0)  --> "0".
  274hexadecimal_digit(1)  --> "1".
  275hexadecimal_digit(2)  --> "2".
  276hexadecimal_digit(3)  --> "3".
  277hexadecimal_digit(4)  --> "4".
  278hexadecimal_digit(5)  --> "5".
  279hexadecimal_digit(6)  --> "6".
  280hexadecimal_digit(7)  --> "7".
  281hexadecimal_digit(8)  --> "8".
  282hexadecimal_digit(9)  --> "9".
  283hexadecimal_digit(10) --> "a".
  284hexadecimal_digit(11) --> "b".
  285hexadecimal_digit(12) --> "c".
  286hexadecimal_digit(13) --> "d".
  287hexadecimal_digit(14) --> "e".
  288hexadecimal_digit(15) --> "f".
  289hexadecimal_digit(10) --> "A".
  290hexadecimal_digit(11) --> "B".
  291hexadecimal_digit(12) --> "C".
  292hexadecimal_digit(13) --> "D".
  293hexadecimal_digit(14) --> "E".
  294hexadecimal_digit(15) --> "F".
 opt_integer_suffix(-Suffix)//
Bind Suffix to one of i, u, l, ll, ul, ull
  300opt_integer_suffix(S) --> unsigned_suffix, long_suffix(L), !, {mkuis(L, S)}.
  301opt_integer_suffix(S) --> long_suffix(L), unsigned_suffix, !, {mkuis(L, S)}.
  302opt_integer_suffix(u) --> unsigned_suffix, !.
  303opt_integer_suffix(L) --> long_suffix(L), !.
  304opt_integer_suffix(i) --> [].
  305
  306mkuis(l, ul).
  307mkuis(ll, ull).
  308
  309unsigned_suffix --> "u".
  310unsigned_suffix --> "U".
  311
  312long_suffix(ll) --> "ll".
  313long_suffix(ll) --> "LL".
  314long_suffix(l)  --> "l".
  315long_suffix(l)  --> "L".
  316
  317floating_constant(F) --> decimal_floating_constant(F).
  318floating_constant(F) --> hexadecimal_floating_constant(F).
  319
  320decimal_floating_constant(F) -->
  321    fractional_constant(FC),
  322    opt_exponent_part(E, _),
  323    floating_suffix(FS, _),
  324    { mkf(FS, FC, E, F) }.
  325decimal_floating_constant(F) -->
  326    digit_sequence_value(FC),
  327    opt_exponent_part(E, Expl),
  328    floating_suffix(FS, Expl),
  329    { Expl == true,
  330      mkf(FS, FC, E, F)
  331    }.
  332
  333hexadecimal_floating_constant(F) -->
  334    hexadecimal_prefix,
  335    (   "."
  336    ->  hexadecimal_fractional_part(FC)
  337    ;   hexadecimal_digits(0, IC),
  338        ".",
  339        hexadecimal_fractional_part(FCP),
  340        { FC is IC+FCP }
  341    ),
  342    binary_exponent_part(E),
  343    floating_suffix(FS, _),
  344    { mkf(FS, FC, E, F) }.
  345
  346mkf(float, FC, E, float(V)) :- V is FC*E.
  347mkf(double, FC, E, double(V)) :- V is FC*E.
  348
  349fractional_constant(FC) -->
  350    digit_sequence(DC1), ".",
  351    digits(DC2),
  352    {   DC2 == []
  353    ->  number_chars(FC, DC1)
  354    ;   append(DC1, [.|DC2], S),
  355        number_chars(FC, S)
  356    }.
  357
  358digit_sequence([D0|DL]) -->
  359    digit(D0), digits(DL).
  360
  361digit_sequence_value(Value) -->
  362    digit_sequence(S),
  363    { number_chars(Value, S) }.
  364
  365hexadecimal_fractional_part(V) -->
  366    hexadecimal_fractional_part(10, 0, V).
  367
  368hexadecimal_fractional_part(I, V0, V) -->
  369    hexadecimal_digit(D), !,
  370    { V1 is V0+D/I,
  371      I2 is I/10
  372    },
  373    hexadecimal_fractional_part(I2, V1, V).
  374hexadecimal_fractional_part(_, V, V) --> [].
  375
  376opt_exponent_part(M, true) -->
  377    exp_e, !,
  378    sign(S),
  379    digit_sequence_value(V),
  380    { M is 10**(S*V) }.
  381opt_exponent_part(1, _) -->
  382    [].
  383
  384binary_exponent_part(M) -->
  385    bin_e,
  386    sign(S),
  387    digit_sequence_value(V),
  388    { M is 10**(S*V) }.
  389
  390exp_e --> "e".
  391exp_e --> "E".
  392
  393bin_e --> "p".
  394bin_e --> "P".
  395
  396sign(-1) --> "-", !.
  397sign(1)  --> "+", !.
  398sign(1)  --> "".
  399
  400floating_suffix(float, true)  --> "f", !.
  401floating_suffix(float, true)  --> "F", !.
  402floating_suffix(double, _) --> "l", !.
  403floating_suffix(double, _) --> "L", !.
  404floating_suffix(double, _) --> "".         % TBD: correct?
 enumeration_constant(-Enum)//
  408enumeration_constant(enum_value(ID)) -->
  409    identifier(ID).
  410
  411character_constant(C) -->
  412    "'", c_char_sequence(V), "'",
  413    { C = char(V) }.
  414character_constant(C) -->
  415    "L'", c_char_sequence(V), "'",
  416    { C = wchar(V) }.
  417
  418c_char_sequence([H|T]) -->
  419    c_char(H),
  420    c_char_sequence_z(T).
  421
  422c_char_sequence_z([H|T]) --> c_char(H), !, c_char_sequence_z(T).
  423c_char_sequence_z([]) --> "".
  424
  425c_char(C) --> [C], { \+ no_c_char(C) }, !.
  426c_char(C) --> escape_sequence(C).
  427
  428no_c_char(0'\').
  429no_c_char(0'\\).
  430no_c_char(0'\n).
  431
  432escape_sequence(C) --> simple_escape_sequence(C).
  433escape_sequence(C) --> octal_escape_sequence(C).
  434escape_sequence(C) --> hexadecimal_escape_sequence(C).
  435escape_sequence(C) --> universal_character_name(C).
  436
  437simple_escape_sequence(0'\') --> "\\'".
  438simple_escape_sequence(0'\") --> "\\\"".
  439simple_escape_sequence(0'?)  --> "\\?".
  440simple_escape_sequence(0'\a) --> "\\a".
  441simple_escape_sequence(0'\b) --> "\\b".
  442simple_escape_sequence(0'\f) --> "\\f".
  443simple_escape_sequence(0'\n) --> "\\n".
  444simple_escape_sequence(0'\r) --> "\\r".
  445simple_escape_sequence(0'\t) --> "\\t".
  446simple_escape_sequence(0'\v) --> "\\v".
  447
  448octal_escape_sequence(C) -->
  449    "\\",
  450    octal_digit(D0),
  451    (   octal_digit(D1)
  452    ->  (   octal_digit(D2)
  453        ->  {number_chars(C, ['0',o,D0,D1,D2])}
  454        ;   {number_chars(C, ['0',o,D0,D1])}
  455        )
  456    ;   {number_chars(C, ['0',o,D0])}
  457    ).
  458
  459hexadecimal_escape_sequence(C) -->
  460    "\\x",
  461    hexadecimal_digits(0, Code),
  462    { char_code(C, Code) }.
  463
  464string_literal(S) -->
  465    "\"", s_char_sequence(Chars), "\"",
  466    sstring_literal_cont(More),
  467    { mkstring(Chars, More, Str),
  468      S = str(Str)
  469    }.
  470string_literal(S) -->
  471    "L\"", s_char_sequence(Chars), "\"",
  472    wstring_literal_cont(More),
  473    { mkstring(Chars, More, Str),
  474      S = wstr(Str)
  475    }.
  476
  477s_char_sequence([H|T]) --> s_char(H), !, s_char_sequence(T).
  478s_char_sequence([]) --> "".
  479
  480s_char(C) --> [C], { \+ no_s_char(C) }, !.
  481s_char(C) --> escape_sequence(C).
  482
  483no_s_char(0'\").
  484no_s_char(0'\\).
  485no_s_char(0'\n).
  486
  487sstring_literal_cont([H|T]) -->
  488    blanks,
  489    "\"", s_char_sequence(Chars), "\"", !,
  490    { string_codes(H, Chars) },
  491    sstring_literal_cont(T).
  492sstring_literal_cont([]) --> "".
  493
  494wstring_literal_cont([H|T]) -->
  495    blanks,
  496    "L\"", s_char_sequence(Chars), "\"", !,
  497    { string_codes(H, Chars) },
  498    wstring_literal_cont(T).
  499wstring_literal_cont([]) --> "".
  500
  501mkstring(Chars, [], Str) :- !,
  502    string_codes(Str, Chars).
  503mkstring(Chars, More, Str) :-
  504    string_codes(Str0, Chars),
  505    atomics_to_string([Str0|More], Str).
 punctuator(-Punct)//
6.4.6
  511punctuator('[') --> "[".
  512punctuator(']') --> "]".
  513punctuator('(') --> "(".
  514punctuator(')') --> ")".
  515punctuator('{') --> "{".
  516punctuator('}') --> "}".
  517punctuator('...') --> "...".
  518punctuator('.') --> ".".
  519punctuator('->') --> "->".
  520punctuator('++') --> "++".
  521punctuator('--') --> "--".
  522punctuator('&=') --> "&=".
  523punctuator('&&') --> "&&".
  524punctuator('&') --> "&".
  525punctuator('*=') --> "*=".
  526punctuator('*') --> "*".
  527punctuator('+=') --> "+=".
  528punctuator('+') --> "+".
  529punctuator('-=') --> "-=".
  530punctuator('-') --> "-".
  531punctuator('~') --> "~".
  532punctuator('!=') --> "!=".
  533punctuator('!') --> "!".
  534punctuator('/=') --> "/=".
  535punctuator('/') --> "/".
  536punctuator('%=') --> "%=".
  537punctuator('%>') --> "%>".
  538punctuator('%:%:') --> "%:%:".
  539punctuator('%:') --> "%:".
  540punctuator('%') --> "%".
  541punctuator('<<=') --> "<<=".
  542punctuator('>>=') --> ">>=".
  543punctuator('<<') --> "<<".
  544punctuator('>>') --> ">>".
  545punctuator('<:') --> "<:".
  546punctuator('<=') --> "<=".
  547punctuator('<%') --> "<%".
  548punctuator('<') --> "<".
  549punctuator(':>') --> ":>".
  550punctuator('>=') --> ">=".
  551punctuator('>') --> ">".
  552punctuator('?') --> "?".
  553punctuator(':') --> ":".
  554punctuator(';') --> ";".
  555punctuator('==') --> "==".
  556punctuator('=') --> "=".
  557punctuator(',') --> ",".
  558punctuator('##') --> "##".
  559punctuator('#') --> "#".
  560punctuator('^=') --> "^=".
  561punctuator('^') --> "^".
  562punctuator('||') --> "||".
  563punctuator('|=') --> "|=".
  564punctuator('|') --> "|".
  565
  566check_punctuators :-			% verify longest matches are first
  567    findall(P, punctuator(P,_,_), L),
  568    (   append(_, [P|T], L),
  569        member(P2, T),
  570        sub_atom(P2, 0, _, _, P),
  571        format('~p should be before ~p~n', [P2, P]),
  572        fail
  573    ;   true
  574    ).
  575
  576:- check_punctuators.  577
  578header_name(Header) -->
  579    "<", string_without(">\n", Codes), ">", !,
  580    { atom_codes(Name, Codes),
  581      Header = header(ab, Name)
  582    }.
  583header_name(Header) -->
  584    "\"", string_without("\"\n", Codes), "\"", !,
  585    { atom_codes(Name, Codes),
  586      Header = header(dq, Name)
  587    }.
  588
  589pp_number(PP) -->
  590    digits(D1a),
  591    (   ".",
  592        digits(D2a)
  593    ->  {append(D1a, [.|D2a], D1)}
  594    ;   {D1 = D1a}
  595    ),
  596    (   identifier_nondigit(NDa)
  597    ->  {D2 = [NDa]}
  598    ;   {D2 = []}
  599    ),
  600    pp_e(D3),
  601    (   "."
  602    ->  {D4 = [.]}
  603    ;   {D4 = []}
  604    ),
  605    { append([D1,D2,D3,D4], D),
  606      string_chars(S, D),
  607      PP = pp(S)
  608    }.
  609
  610pp_e([C1|C2]) -->
  611    pp_ee(C1),
  612    pp_sign(C2).
  613
  614pp_ee(e) --> "e".
  615pp_ee('E') --> "e".
  616pp_ee(p) --> "p".
  617pp_ee('P') --> "P".
  618
  619pp_sign('-') --> "-".
  620pp_sign('+') --> "+".
  621
  622pp_line(pp(Line)) -->
  623    "#", string(Codes), eol, !,
  624    { string_codes(Line, [0'#|Codes]) }.
  625
  626eol --> "\n", !.
  627eol --> eos