1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2017, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(c99_tokens,
   36          [ c99_tokens//1,              % -List
   37            c99_token//1                % -Token
   38          ]).   39:- use_module(library(dcg/basics),
   40              [ blanks//0, string//1, string_without//2, eos//0]).
 c99_tokens(-Tokens)// is semidet
Tokenize an input according to the C99 rules. Tokens are:
   70c99_tokens(List) -->
   71    c99_token(H), !,
   72    (   {skip_token(H)}
   73    ->  c99_tokens(List)
   74    ;   {List = [H|T]},
   75        c99_tokens(T)
   76    ).
   77c99_tokens([]) -->
   78    blanks.
   79
   80skip_token(pp(_)).
   81
   82c99_token(Token) -->
   83    blanks,
   84    token(Token).
 token(-Token)//
A1: recognise a C99 token.
   90token(T) --> keyword(T), \+ identifier_cont_char(_).
   91token(T) --> identifier(T).
   92token(T) --> constant(T).
   93token(T) --> string_literal(T).
   94token(T) --> pp_line(T).
   95token(T) --> punctuator(T).
   96token(T) --> header_name(T).
   97token(T) --> pp_number(T).
   98
   99keyword(auto)         --> "auto".
  100keyword(break)        --> "break".
  101keyword(case)         --> "case".
  102keyword(char)         --> "char".
  103keyword(const)        --> "const".
  104keyword(continue)     --> "continue".
  105keyword(default)      --> "default".
  106keyword(do)           --> "do".
  107keyword(double)       --> "double".
  108keyword(else)         --> "else".
  109keyword(enum)         --> "enum".
  110keyword(extern)       --> "extern".
  111keyword(float)        --> "float".
  112keyword(for)          --> "for".
  113keyword(goto)         --> "goto".
  114keyword(if)           --> "if".
  115keyword(inline)       --> "inline".
  116keyword(int)          --> "int".
  117keyword(size_t)       --> "size_t".             % Clang on Macos
  118keyword(long)         --> "long".
  119keyword(register)     --> "register".
  120keyword(restrict)     --> "restrict".
  121keyword(return)       --> "return".
  122keyword(short)        --> "short".
  123keyword(signed)       --> "signed".
  124keyword(sizeof)       --> "sizeof".
  125keyword(static)       --> "static".
  126keyword(struct)       --> "struct".
  127keyword(switch)       --> "switch".
  128keyword(typedef)      --> "typedef".
  129keyword(union)        --> "union".
  130keyword(unsigned)     --> "unsigned".
  131keyword(void)         --> "void".
  132keyword(volatile)     --> "volatile".
  133keyword(while)        --> "while".
  134keyword('_Bool')      --> "_Bool".
  135keyword('_Complex')   --> "_Complex".
  136keyword('_Imaginary') --> "_Imaginary".
  137keyword('_Float128')   --> "_Float128".		% GCC
  138keyword('__attribute__') --> "__attribute__".   % GCC
  139keyword('__restrict__') --> "__restrict__".
  140keyword('__restrict__') --> "__restrict".
  141keyword('__extension__') --> "__extension__".
  142keyword(inline) --> "__inline__".
  143keyword(inline) --> "__inline".
  144keyword('__builtin_va_list') --> "__builtin_va_list".
  145keyword('__gnuc_va_list') --> "__gnuc_va_list".
  146keyword('__asm__') --> "__asm__".
  147keyword('__asm__') --> "__asm".
  148keyword('__alignof__') --> "__alignof__".
  149keyword('_Nonnull') --> "_Nonnull".
  150keyword('_Nullable') --> "_Nullable".
  151
  152identifier(Id) --> identifier_nondigit(H), identifier_cont(T),
  153		   {atom_chars(I, [H|T]), Id = id(I)}.
  154
  155identifier_cont([H|T]) -->
  156    identifier_cont_char(H), !,
  157    identifier_cont(T).
  158identifier_cont([]) --> [].
  159
  160identifier_cont_char(H) -->
  161    identifier_nondigit(H), !.
  162identifier_cont_char(H) -->
  163    digit(H).
  164
  165identifier_nondigit(I) --> nondigit(I).
  166identifier_nondigit(I) --> universal_character_name(I).
  167%identifier_nondigit(I) --> other_implementation_defined_characters(I).
  168
  169term_expansion((nondigit(_) --> "_"), Clauses) :-
  170    findall((nondigit(C) --> S), nondigit_code(C, S), Clauses).
  171
  172nondigit_code(Char, [C]) :-
  173    (   C = 0'_ ; between(0'a,0'z,C) ; between(0'A,0'Z,C) ),
  174    char_code(Char, C).
  175
  176nondigit(_) --> "_".
  177
  178digit('0') --> "0".
  179digit('1') --> "1".
  180digit('2') --> "2".
  181digit('3') --> "3".
  182digit('4') --> "4".
  183digit('5') --> "5".
  184digit('6') --> "6".
  185digit('7') --> "7".
  186digit('8') --> "8".
  187digit('9') --> "9".
  188
  189universal_character_name(C) -->
  190    "\\u", hex_quad(V),
  191    { char_code(C, V) }.
  192universal_character_name(C) -->
  193    "\\U", hex_quad(V), hex_quad(W),
  194    { Code is (V<<32) + W, char_code(C, Code) }.
  195
  196hex_quad(V) -->
  197    hexadecimal_digit(X1),
  198    hexadecimal_digit(X2),
  199    hexadecimal_digit(X3),
  200    hexadecimal_digit(X4),
  201    { V is (X1<<12) + (X2<<8) + (X3<<4) + X4 }.
  202
  203
  204constant(C) --> floating_constant(C).
  205constant(C) --> integer_constant(C).
  206constant(C) --> enumeration_constant(C).
  207constant(C) --> character_constant(C).
  208
  209integer_constant(C) -->
  210    decimal_constant(D), opt_integer_suffix(S), {mkic(S,D,C)}.
  211integer_constant(C) -->
  212    hexadecimal_constant(D), opt_integer_suffix(S), {mkic(S,D,C)}.
  213integer_constant(C) -->
  214    octal_constant(D), opt_integer_suffix(S), {mkic(S,D,C)}.
  215
  216mkic(Suffix, Value, Token) :- Token =.. [Suffix,Value].
  217
  218decimal_constant(D) -->
  219    nonzero_digit(D0), digits(DT), { number_chars(D, [D0|DT]) }.
  220
  221octal_constant(D) -->
  222    "0", octal_digits(L), { L == [] -> D = 0 ; number_chars(D, ['0',o|L]) }.
  223
  224digits([H|T]) --> digit(H), !, digits(T).
  225digits([]) --> [].
  226
  227octal_digits([H|T]) --> octal_digit(H), !, octal_digits(T).
  228octal_digits([]) --> [].
  229
  230hexadecimal_constant(D) -->
  231    hexadecimal_prefix,
  232    hexadecimal_digits(0, D).
  233
  234hexadecimal_prefix --> "0x".
  235hexadecimal_prefix --> "0X".
  236
  237hexadecimal_digits(V0, V) -->
  238    hexadecimal_digit(D),
  239    !,
  240    { V1 is V0*16+D },
  241    hexadecimal_digits(V1, V).
  242hexadecimal_digits(V, V) -->
  243    [].
  244
  245nonzero_digit('1') --> "1".
  246nonzero_digit('2') --> "2".
  247nonzero_digit('3') --> "3".
  248nonzero_digit('4') --> "4".
  249nonzero_digit('5') --> "5".
  250nonzero_digit('6') --> "6".
  251nonzero_digit('7') --> "7".
  252nonzero_digit('8') --> "8".
  253nonzero_digit('9') --> "9".
  254
  255octal_digit('0') --> "0".
  256octal_digit('1') --> "1".
  257octal_digit('2') --> "2".
  258octal_digit('3') --> "3".
  259octal_digit('4') --> "4".
  260octal_digit('5') --> "5".
  261octal_digit('6') --> "6".
  262octal_digit('7') --> "7".
  263
  264hexadecimal_digit(0)  --> "0".
  265hexadecimal_digit(1)  --> "1".
  266hexadecimal_digit(2)  --> "2".
  267hexadecimal_digit(3)  --> "3".
  268hexadecimal_digit(4)  --> "4".
  269hexadecimal_digit(5)  --> "5".
  270hexadecimal_digit(6)  --> "6".
  271hexadecimal_digit(7)  --> "7".
  272hexadecimal_digit(8)  --> "8".
  273hexadecimal_digit(9)  --> "9".
  274hexadecimal_digit(10) --> "a".
  275hexadecimal_digit(11) --> "b".
  276hexadecimal_digit(12) --> "c".
  277hexadecimal_digit(13) --> "d".
  278hexadecimal_digit(14) --> "e".
  279hexadecimal_digit(15) --> "f".
  280hexadecimal_digit(10) --> "A".
  281hexadecimal_digit(11) --> "B".
  282hexadecimal_digit(12) --> "C".
  283hexadecimal_digit(13) --> "D".
  284hexadecimal_digit(14) --> "E".
  285hexadecimal_digit(15) --> "F".
 opt_integer_suffix(-Suffix)//
Bind Suffix to one of i, u, l, ll, ul, ull
  291opt_integer_suffix(S) --> unsigned_suffix, long_suffix(L), !, {mkuis(L, S)}.
  292opt_integer_suffix(S) --> long_suffix(L), unsigned_suffix, !, {mkuis(L, S)}.
  293opt_integer_suffix(u) --> unsigned_suffix, !.
  294opt_integer_suffix(L) --> long_suffix(L), !.
  295opt_integer_suffix(i) --> [].
  296
  297mkuis(l, ul).
  298mkuis(ll, ull).
  299
  300unsigned_suffix --> "u".
  301unsigned_suffix --> "U".
  302
  303long_suffix(ll) --> "ll".
  304long_suffix(ll) --> "LL".
  305long_suffix(l)  --> "l".
  306long_suffix(l)  --> "L".
  307
  308floating_constant(F) --> decimal_floating_constant(F).
  309floating_constant(F) --> hexadecimal_floating_constant(F).
  310
  311decimal_floating_constant(F) -->
  312    fractional_constant(FC),
  313    opt_exponent_part(E, _),
  314    floating_suffix(FS, _),
  315    { mkf(FS, FC, E, F) }.
  316decimal_floating_constant(F) -->
  317    digit_sequence_value(FC),
  318    opt_exponent_part(E, Expl),
  319    floating_suffix(FS, Expl),
  320    { Expl == true,
  321      mkf(FS, FC, E, F)
  322    }.
  323
  324hexadecimal_floating_constant(F) -->
  325    hexadecimal_prefix,
  326    (   "."
  327    ->  hexadecimal_fractional_part(FC)
  328    ;   hexadecimal_digits(0, IC),
  329        ".",
  330        hexadecimal_fractional_part(FCP),
  331        { FC is IC+FCP }
  332    ),
  333    binary_exponent_part(E),
  334    floating_suffix(FS, _),
  335    { mkf(FS, FC, E, F) }.
  336
  337mkf(float, FC, E, float(V)) :- V is FC*E.
  338mkf(double, FC, E, double(V)) :- V is FC*E.
  339
  340fractional_constant(FC) -->
  341    digit_sequence(DC1), ".",
  342    digits(DC2),
  343    {   DC2 == []
  344    ->  number_chars(FC, DC1)
  345    ;   append(DC1, [.|DC2], S),
  346        number_chars(FC, S)
  347    }.
  348
  349digit_sequence([D0|DL]) -->
  350    digit(D0), digits(DL).
  351
  352digit_sequence_value(Value) -->
  353    digit_sequence(S),
  354    { number_chars(Value, S) }.
  355
  356hexadecimal_fractional_part(V) -->
  357    hexadecimal_fractional_part(10, 0, V).
  358
  359hexadecimal_fractional_part(I, V0, V) -->
  360    hexadecimal_digit(D), !,
  361    { V1 is V0+D/I,
  362      I2 is I/10
  363    },
  364    hexadecimal_fractional_part(I2, V1, V).
  365hexadecimal_fractional_part(_, V, V) --> [].
  366
  367opt_exponent_part(M, true) -->
  368    exp_e, !,
  369    sign(S),
  370    digit_sequence_value(V),
  371    { M is 10**(S*V) }.
  372opt_exponent_part(1, _) -->
  373    [].
  374
  375binary_exponent_part(M) -->
  376    bin_e,
  377    sign(S),
  378    digit_sequence_value(V),
  379    { M is 10**(S*V) }.
  380
  381exp_e --> "e".
  382exp_e --> "E".
  383
  384bin_e --> "p".
  385bin_e --> "P".
  386
  387sign(-1) --> "-", !.
  388sign(1)  --> "+", !.
  389sign(1)  --> "".
  390
  391floating_suffix(float, true)  --> "f", !.
  392floating_suffix(float, true)  --> "F", !.
  393floating_suffix(double, _) --> "l", !.
  394floating_suffix(double, _) --> "L", !.
  395floating_suffix(double, _) --> "".         % TBD: correct?
 enumeration_constant(-Enum)//
  399enumeration_constant(enum_value(ID)) -->
  400    identifier(ID).
  401
  402character_constant(C) -->
  403    "'", c_char_sequence(V), "'",
  404    { C = char(V) }.
  405character_constant(C) -->
  406    "L'", c_char_sequence(V), "'",
  407    { C = wchar(V) }.
  408
  409c_char_sequence([H|T]) -->
  410    c_char(H),
  411    c_char_sequence_z(T).
  412
  413c_char_sequence_z([H|T]) --> c_char(H), !, c_char_sequence_z(T).
  414c_char_sequence_z([]) --> "".
  415
  416c_char(C) --> [C], { \+ no_c_char(C) }, !.
  417c_char(C) --> escape_sequence(C).
  418
  419no_c_char(0'\').
  420no_c_char(0'\\).
  421no_c_char(0'\n).
  422
  423escape_sequence(C) --> simple_escape_sequence(C).
  424escape_sequence(C) --> octal_escape_sequence(C).
  425escape_sequence(C) --> hexadecimal_escape_sequence(C).
  426escape_sequence(C) --> universal_character_name(C).
  427
  428simple_escape_sequence(0'\') --> "\\'".
  429simple_escape_sequence(0'\") --> "\\\"".
  430simple_escape_sequence(0'?)  --> "\\?".
  431simple_escape_sequence(0'\a) --> "\\a".
  432simple_escape_sequence(0'\b) --> "\\b".
  433simple_escape_sequence(0'\f) --> "\\f".
  434simple_escape_sequence(0'\n) --> "\\n".
  435simple_escape_sequence(0'\r) --> "\\r".
  436simple_escape_sequence(0'\t) --> "\\t".
  437simple_escape_sequence(0'\v) --> "\\v".
  438
  439octal_escape_sequence(C) -->
  440    "\\",
  441    octal_digit(D0),
  442    (   octal_digit(D1)
  443    ->  (   octal_digit(D2)
  444        ->  {number_chars(C, ['0',o,D0,D1,D2])}
  445        ;   {number_chars(C, ['0',o,D0,D1])}
  446        )
  447    ;   {number_chars(C, ['0',o,D0])}
  448    ).
  449
  450hexadecimal_escape_sequence(C) -->
  451    "\\x",
  452    hexadecimal_digits(0, Code),
  453    { char_code(C, Code) }.
  454
  455string_literal(S) -->
  456    "\"", s_char_sequence(Chars), "\"",
  457    sstring_literal_cont(More),
  458    { mkstring(Chars, More, Str),
  459      S = str(Str)
  460    }.
  461string_literal(S) -->
  462    "L\"", s_char_sequence(Chars), "\"",
  463    wstring_literal_cont(More),
  464    { mkstring(Chars, More, Str),
  465      S = wstr(Str)
  466    }.
  467
  468s_char_sequence([H|T]) --> s_char(H), !, s_char_sequence(T).
  469s_char_sequence([]) --> "".
  470
  471s_char(C) --> [C], { \+ no_s_char(C) }, !.
  472s_char(C) --> escape_sequence(C).
  473
  474no_s_char(0'\").
  475no_s_char(0'\\).
  476no_s_char(0'\n).
  477
  478sstring_literal_cont([H|T]) -->
  479    blanks,
  480    "\"", s_char_sequence(Chars), "\"", !,
  481    { string_codes(H, Chars) },
  482    sstring_literal_cont(T).
  483sstring_literal_cont([]) --> "".
  484
  485wstring_literal_cont([H|T]) -->
  486    blanks,
  487    "L\"", s_char_sequence(Chars), "\"", !,
  488    { string_codes(H, Chars) },
  489    wstring_literal_cont(T).
  490wstring_literal_cont([]) --> "".
  491
  492mkstring(Chars, [], Str) :- !,
  493    string_codes(Str, Chars).
  494mkstring(Chars, More, Str) :-
  495    string_codes(Str0, Chars),
  496    atomics_to_string([Str0|More], Str).
 punctuator(-Punct)//
6.4.6
  502punctuator('[') --> "[".
  503punctuator(']') --> "]".
  504punctuator('(') --> "(".
  505punctuator(')') --> ")".
  506punctuator('{') --> "{".
  507punctuator('}') --> "}".
  508punctuator('...') --> "...".
  509punctuator('.') --> ".".
  510punctuator('->') --> "->".
  511punctuator('++') --> "++".
  512punctuator('--') --> "--".
  513punctuator('&=') --> "&=".
  514punctuator('&&') --> "&&".
  515punctuator('&') --> "&".
  516punctuator('*=') --> "*=".
  517punctuator('*') --> "*".
  518punctuator('+=') --> "+=".
  519punctuator('+') --> "+".
  520punctuator('-=') --> "-=".
  521punctuator('-') --> "-".
  522punctuator('~') --> "~".
  523punctuator('!=') --> "!=".
  524punctuator('!') --> "!".
  525punctuator('/=') --> "/=".
  526punctuator('/') --> "/".
  527punctuator('%=') --> "%=".
  528punctuator('%>') --> "%>".
  529punctuator('%:%:') --> "%:%:".
  530punctuator('%:') --> "%:".
  531punctuator('%') --> "%".
  532punctuator('<<=') --> "<<=".
  533punctuator('>>=') --> ">>=".
  534punctuator('<<') --> "<<".
  535punctuator('>>') --> ">>".
  536punctuator('<:') --> "<:".
  537punctuator('<=') --> "<=".
  538punctuator('<%') --> "<%".
  539punctuator('<') --> "<".
  540punctuator(':>') --> ":>".
  541punctuator('>=') --> ">=".
  542punctuator('>') --> ">".
  543punctuator('?') --> "?".
  544punctuator(':') --> ":".
  545punctuator(';') --> ";".
  546punctuator('==') --> "==".
  547punctuator('=') --> "=".
  548punctuator(',') --> ",".
  549punctuator('##') --> "##".
  550punctuator('#') --> "#".
  551punctuator('^=') --> "^=".
  552punctuator('^') --> "^".
  553punctuator('||') --> "||".
  554punctuator('|=') --> "|=".
  555punctuator('|') --> "|".
  556
  557check_punctuators :-			% verify longest matches are first
  558    findall(P, punctuator(P,_,_), L),
  559    (   append(_, [P|T], L),
  560        member(P2, T),
  561        sub_atom(P2, 0, _, _, P),
  562        format('~p should be before ~p~n', [P2, P]),
  563        fail
  564    ;   true
  565    ).
  566
  567:- check_punctuators.  568
  569header_name(Header) -->
  570    "<", string_without(">\n", Codes), ">", !,
  571    { atom_codes(Name, Codes),
  572      Header = header(ab, Name)
  573    }.
  574header_name(Header) -->
  575    "\"", string_without("\"\n", Codes), "\"", !,
  576    { atom_codes(Name, Codes),
  577      Header = header(dq, Name)
  578    }.
  579
  580pp_number(PP) -->
  581    digits(D1a),
  582    (   ".",
  583        digits(D2a)
  584    ->  {append(D1a, [.|D2a], D1)}
  585    ;   {D1 = D1a}
  586    ),
  587    (   identifier_nondigit(NDa)
  588    ->  {D2 = [NDa]}
  589    ;   {D2 = []}
  590    ),
  591    pp_e(D3),
  592    (   "."
  593    ->  {D4 = [.]}
  594    ;   {D4 = []}
  595    ),
  596    { append([D1,D2,D3,D4], D),
  597      string_chars(S, D),
  598      PP = pp(S)
  599    }.
  600
  601pp_e([C1|C2]) -->
  602    pp_ee(C1),
  603    pp_sign(C2).
  604
  605pp_ee(e) --> "e".
  606pp_ee('E') --> "e".
  607pp_ee(p) --> "p".
  608pp_ee('P') --> "P".
  609
  610pp_sign('-') --> "-".
  611pp_sign('+') --> "+".
  612
  613pp_line(pp(Line)) -->
  614    "#", string(Codes), eol, !,
  615    { string_codes(Line, [0'#|Codes]) }.
  616
  617eol --> "\n", !.
  618eol --> eos