"\\\n" { /* backslash immediately followed by a newline: line continuation, so no logical NEWLINE token is returned */ }
"\n" { return NEWLINE; }
^[ \t]+ { /* whitespace at the start of a line: the indent level should be calculated here; TODO: this action does nothing yet */ }
[ \t]+ { /* whitespace not at the start of a line: discard */ }
/*
 * Compute the 1-based column reached after scanning every character of s.
 *
 * s is expected to hold a line's leading whitespace. A tab advances to the
 * next tab stop (columns 1, 9, 17, ...), so one tab from column 1 is
 * equivalent to eight spaces. Any other character advances by one column.
 *
 * Returns 1 for the empty string. s must not be NULL.
 *
 * NOTE(review): the counter is a local here; if a file-level `column` was
 * meant to be updated as a side effect, confirm against the callers.
 */
int indentlevel(char *s)
{
    enum { TAB_WIDTH = 8 };
    int column = 1;

    for (; *s != '\0'; s++) {
        if (*s == '\t') {
            /* Always move at least one column, then stop on a tab stop. */
            do {
                column++;
            } while ((column - 1) % TAB_WIDTH != 0);
        } else {
            column++;
        }
    }
    return column;
}
/*
 * Outline of a wrapper around the scanner. The body contains no statements
 * yet, only the plan described in the comments below.
 *
 * NOTE(review): declared to return int but currently has no return
 * statement; it must be filled in before a caller uses the result.
 */
int yylex()
{
/* if saved tokens are on the saved-token (DEDENT) stack, return the top one */
/* else return yylex2() */
}
| Token | Meaning/Comment |
|---|---|
| ENDMARKER | Some grammars expect a token upon seeing EOF! |
| NAME | identifier |
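| NUMBER | Surprisingly, the Python grammar uses a single NUMBER token and does not distinguish reals from integers. From realpython we note that numbers can have _ in them, as in 1_000. A leading - seems to be part of the number (although Python itself lexes -1 as the unary operator - followed by the literal 1). Real numbers include scientific notation, like 1e6. The exponent can have + or - after the e or E. |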
| STRING | all kinds ' " and ''' |
| NEWLINE | LOGICAL newline |
| INDENT | this line's initial whitespace larger than previous line |
| DEDENT | this line's initial whitespace smaller than previous line |
| LPAR | '(' |
| RPAR | ')' |
| LSQB | '[' |
| RSQB | ']' |
| COLON | ':' |
| COMMA | ',' |
| SEMI | ';' |
| PLUS | '+' |
| MINUS | '-' |
| STAR | '*' |
| SLASH | '/' |
| VBAR | '|' |
| AMPER | '&' |
| LESS | '<' |
| GREATER | '>' |
| EQUAL | '=' |
| DOT | '.' |
| PERCENT | '%' |
| LBRACE | '{' |
| RBRACE | '}' |
| EQEQUAL | '==' |
| NOTEQUAL | '!=' |
| LESSEQUAL | '<=' |
| GREATEREQUAL | '>=' |
| TILDE | '~' |
| CIRCUMFLEX | '^' |
| LEFTSHIFT | '<<' |
| RIGHTSHIFT | '>>' |
| DOUBLESTAR | '**' |
| PLUSEQUAL | '+=' |
| MINEQUAL | '-=' |
| STAREQUAL | '*=' |
| SLASHEQUAL | '/=' |
| PERCENTEQUAL | '%=' |
| AMPEREQUAL | '&=' |
| VBAREQUAL | '|=' |
| CIRCUMFLEXEQUAL | '^=' |
| LEFTSHIFTEQUAL | '<<=' |
| RIGHTSHIFTEQUAL | '>>=' |
| DOUBLESTAREQUAL | '**=' |
| DOUBLESLASH | '//' |
| DOUBLESLASHEQUAL | '//=' |
| AT | '@' |
| ATEQUAL | '@=' |
| RARROW | '->' |
| ELLIPSIS | '...' |
| COLONEQUAL | ':=' |
| OP | don't know what this is yet. Grammar does not have it. |
| AWAIT | presumably a newer reserved word; not in PunY |
| ASYNC | presumably a newer reserved word; not in PunY |
| TYPE_IGNORE | # type: ignore, special comment, not in PunY |
| TYPE_COMMENT | e.g. # type:(str) -> str, not in PunY |
| ERRORTOKEN | turn lexical error into a syntax error |
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
They say: # note below: the ('.' | '...') is necessary because '...' is
tokenized as ELLIPSIS. So should I write '...' literally in the grammar, or
should I add a token called ELLIPSIS that matches three dots?
In Bison I would translate that roughly as:
/*
 * compound_stmt: the compound statement forms.
 * Alternatives that are outside PunY are still accepted by the grammar, but
 * their actions call not_puny() with the name of the offending construct.
 */
compound_stmt:
if_stmt
| while_stmt
| for_stmt
| try_stmt { not_puny("try statement"); }
| with_stmt { not_puny("with statement"); }
| funcdef
| classdef
| decorated { not_puny("decorated statement"); }
| async_stmt { not_puny("async statement"); }
;
/*
 * Representation of a type.
 *
 * base_type selects the kind of type; for compound kinds the union u holds
 * the supporting information (currently only function types have any).
 */
struct type {
    /*
     * Integer code that says what kind of type this is.
     * Includes all primitive types:
     *   1 = int, 2 = float, 3 = string, 4 = bool.
     * Also includes codes for compound types, which then also
     * hold type information in the supporting union:
     *   5 = list, 6 = dict, 7 = func, 8 = class.
     */
    int base_type;
    /* other members that a full-language version would carry are gone for PunY */
    union {
        /* function type: return type plus nparams parameter descriptions */
        struct funcdef {
            struct type *return_type;
            int nparams;
            struct params **p;
        } f;
        /*
         * Maybe we can get away with just "knowing" the predefined class
         * info; otherwise a class member would be needed here, under a name
         * other than f (which is already taken by the function member):
         *
         *   struct classdef {
         *       struct methods **meth;
         *       struct members **mem;
         *   } c;
         */
    } u;
};
/* A member (field) of a struct: its name and the type of the element. */
struct field {
    char *name;             /* field name */
    struct type *elemtype;  /* type of this field */
};