diff options
Diffstat (limited to 'languages/java/java.g')
-rw-r--r-- | languages/java/java.g | 1318 |
1 files changed, 1318 insertions, 0 deletions
diff --git a/languages/java/java.g b/languages/java/java.g new file mode 100644 index 00000000..1f825ec5 --- /dev/null +++ b/languages/java/java.g @@ -0,0 +1,1318 @@ + +header "pre_include_hpp" { + #include "driver.h" + #include "JavaAST.hpp" + + #include <qlistview.h> + #include <kdebug.h> + + #define SET_POSITION(ast,t)\ + { \ + RefJavaAST(ast)->setLine( t->getLine() );\ + RefJavaAST(ast)->setColumn( t->getColumn() ); \ + } +} + +options { + language="Cpp"; +} + +/** Java 1.3 Recognizer + * + * Run 'java Main [-showtree] directory-full-of-java-files' + * + * [The -showtree option pops up a Swing frame that shows + * the AST constructed from the parser.] + * + * Run 'java Main <directory full of java files>' + * + * Contributing authors: + * John Mitchell johnm@non.net + * Terence Parr parrt@magelang.com + * John Lilley jlilley@empathy.com + * Scott Stanchfield thetick@magelang.com + * Markus Mohnen mohnen@informatik.rwth-aachen.de + * Peter Williams pete.williams@sun.com + * Allan Jacobs Allan.Jacobs@eng.sun.com + * Steve Messick messick@redhills.com + * John Pybus john@pybus.org + * + * Version 1.00 December 9, 1997 -- initial release + * Version 1.01 December 10, 1997 + * fixed bug in octal def (0..7 not 0..8) + * Version 1.10 August 1998 (parrt) + * added tree construction + * fixed definition of WS,comments for mac,pc,unix newlines + * added unary plus + * Version 1.11 (Nov 20, 1998) + * Added "shutup" option to turn off last ambig warning. + * Fixed inner class def to allow named class defs as statements + * synchronized requires compound not simple statement + * add [] after builtInType DOT class in primaryExpression + * "const" is reserved but not valid..removed from modifiers + * Version 1.12 (Feb 2, 1999) + * Changed LITERAL_xxx to xxx in tree grammar. + * Updated java.g to use tokens {...} now for 2.6.0 (new feature). + * + * Version 1.13 (Apr 23, 1999) + * Didn't have (stat)? for else clause in tree parser. + * Didn't gen ASTs for interface extends. 
Updated tree parser too. + * Updated to 2.6.0. + * Version 1.14 (Jun 20, 1999) + * Allowed final/abstract on local classes. + * Removed local interfaces from methods + * Put instanceof precedence where it belongs...in relationalExpr + * It also had expr not type as arg; fixed it. + * Missing ! on SEMI in classBlock + * fixed: (expr) + "string" was parsed incorrectly (+ as unary plus). + * fixed: didn't like Object[].class in parser or tree parser + * Version 1.15 (Jun 26, 1999) + * Screwed up rule with instanceof in it. :( Fixed. + * Tree parser didn't like (expr).something; fixed. + * Allowed multiple inheritance in tree grammar. oops. + * Version 1.16 (August 22, 1999) + * Extending an interface built a wacky tree: had extra EXTENDS. + * Tree grammar didn't allow multiple superinterfaces. + * Tree grammar didn't allow empty var initializer: {} + * Version 1.17 (October 12, 1999) + * ESC lexer rule allowed 399 max not 377 max. + * java.tree.g didn't handle the expression of synchronized + * statements. + * Version 1.18 (August 12, 2001) + * Terence updated to Java 2 Version 1.3 by + * observing/combining work of Allan Jacobs and Steve + * Messick. Handles 1.3 src. Summary: + * o primary didn't include boolean.class kind of thing + * o constructor calls parsed explicitly now: + * see explicitConstructorInvocation + * o add strictfp modifier + * o missing objBlock after new expression in tree grammar + * o merged local class definition alternatives, moved after declaration + * o fixed problem with ClassName.super.field + * o reordered some alternatives to make things more efficient + * o long and double constants were not differentiated from int/float + * o whitespace rule was inefficient: matched only one char + * o add an examples directory with some nasty 1.3 cases + * o made Main.java use buffered IO and a Reader for Unicode support + * o supports UNICODE? + * Using Unicode charVocabulary makes code file big, but only + * in the bitsets at the end. 
I need to make ANTLR generate + * unicode bitsets more efficiently. + * Version 1.19 (April 25, 2002) + * Terence added in nice fixes by John Pybus concerning floating + * constants and problems with super() calls. John did a nice + * reorg of the primary/postfix expression stuff to read better + * and makes f.g.super() parse properly (it was METHOD_CALL not + * a SUPER_CTOR_CALL). Also: + * + * o "finally" clause was a root...made it a child of "try" + * o Added stuff for asserts too for Java 1.4, but *commented out* + * as it is not backward compatible. + * + * Version 1.20 (October 27, 2002) + * + * Terence ended up reorging John Pybus' stuff to + * remove some nondeterminisms and some syntactic predicates. + * Note that the grammar is stricter now; e.g., this(...) must + * be the first statement. + * + * Trinary ?: operator wasn't working as array name: + * (isBig ? bigDigits : digits)[i]; + * + * Checked parser/tree parser on source for + * Resin-2.0.5, jive-2.1.1, jdk 1.3.1, Lucene, antlr 2.7.2a4, + * and the 110k-line jGuru server source. 
+ * + * Version tracking now done with following ID: + * + * $Id$ + * + * This grammar is in the PUBLIC DOMAIN + */ +class JavaRecognizer extends Parser; +options { + k = 2; // two token lookahead + exportVocab=Java; // Call its vocabulary "Java" + codeGenMakeSwitchThreshold = 2; // Some optimizations + codeGenBitsetTestThreshold = 3; + defaultErrorHandler = true; + buildAST = true; + ASTLabelType = "RefJavaAST"; +} + +tokens { + BLOCK; MODIFIERS; OBJBLOCK; SLIST; CTOR_DEF; METHOD_DEF; VARIABLE_DEF; + INSTANCE_INIT; STATIC_INIT; TYPE; CLASS_DEF; INTERFACE_DEF; + PACKAGE_DEF; ARRAY_DECLARATOR; EXTENDS_CLAUSE; IMPLEMENTS_CLAUSE; + PARAMETERS; PARAMETER_DEF; LABELED_STAT; TYPECAST; INDEX_OP; + POST_INC; POST_DEC; METHOD_CALL; EXPR; ARRAY_INIT; + IMPORT; UNARY_MINUS; UNARY_PLUS; CASE_GROUP; ELIST; FOR_INIT; FOR_CONDITION; + FOR_ITERATOR; EMPTY_STAT; FINAL="final"; ABSTRACT="abstract"; + STRICTFP="strictfp"; SUPER_CTOR_CALL; CTOR_CALL; +} + +// Members added to the generated parser class: they route every parser +// diagnostic to a Driver as a Problem instead of using ANTLR's defaults. +{ +private: + // Receiver of all reported problems. It has no initializer here and is + // dereferenced unconditionally below, so setDriver() must be called + // before setFileName() or any report*() method runs. + Driver* m_driver; + +public: + // Attach the driver that will receive problems. + void setDriver( Driver* d ) { m_driver = d; } + // Assign fileName to the driver's current file name. + void setFileName( const QString& fileName ) { m_driver->currentFileName() = fileName; } + + // Report a recognition error at the line/column carried by the exception. + void reportError( const ANTLR_USE_NAMESPACE(antlr)RecognitionException& ex ){ + m_driver->addProblem( m_driver->currentFileName(), Problem( QString::fromLocal8Bit(ex.getMessage().c_str()), ex.getLine(), ex.getColumn()) ); + } + + // Report a plain error message at the position of the next lookahead token, LT(1). + void reportError( const ANTLR_USE_NAMESPACE(std)string& errorMessage ){ + m_driver->addProblem( m_driver->currentFileName(), Problem( QString::fromLocal8Bit(errorMessage.c_str()), LT(1)->getLine(), LT(1)->getColumn()) ); + } + + // Report a plain message at the position of LT(1); it is recorded the + // same way as an error. + void reportMessage( const ANTLR_USE_NAMESPACE(std)string& message ){ + m_driver->addProblem( m_driver->currentFileName(), Problem( QString::fromLocal8Bit(message.c_str()), LT(1)->getLine(), LT(1)->getColumn()) ); + } +} + +// Compilation Unit: In Java, this is a single file. 
This is the start +// rule for this parser +compilationUnit + : // A compilation unit starts with an optional package definition + ( packageDefinition + | /* nothing */ + ) + + // Next we have a series of zero or more import statements + ( importDefinition )* + + // Wrapping things up with any number of class or interface + // definitions + ( typeDefinition )* + + EOF! + ; + +// Package statement: "package" followed by an identifier. +packageDefinition + options {defaultErrorHandler = true;} // let ANTLR handle errors + : p:"package"^ {#p->setType(PACKAGE_DEF);} identifier SEMI! + ; + +// Import statement: import followed by a package or class name +importDefinition + options {defaultErrorHandler = true;} + : i:"import"^ {#i->setType(IMPORT);} identifierStar SEMI! + ; + +// A type definition in a file is either a class or interface definition. +typeDefinition + options {defaultErrorHandler = true;} + : m:modifiers! + ( classDefinition[#m] + | interfaceDefinition[#m] + ) + | SEMI! + ; + +/** A declaration is the creation of a reference or primitive-type variable + * Create a separate Type/Var tree for each var in the var list. + */ +declaration! + : m:modifiers t:typeSpec[false] v:variableDefinitions[#m,#t] + {#declaration = #v;} + ; + +// A type specification is a type name with possible brackets afterwards +// (which would make it an array type). +typeSpec[bool addImagNode] + : classTypeSpec[addImagNode] + | builtInTypeSpec[addImagNode] + ; + +// A class type specification is a class type with possible brackets afterwards +// (which would make it an array type). +classTypeSpec[bool addImagNode] + : identifier (lb:LBRACK^ {#lb->setType(ARRAY_DECLARATOR);} RBRACK!)* + { + if ( addImagNode ) { + #classTypeSpec = #(#[TYPE,"TYPE"], #classTypeSpec); + } + } + ; + +// A builtin type specification is a builtin type with possible brackets +// afterwards (which would make it an array type). 
+builtInTypeSpec[bool addImagNode] + : builtInType (lb:LBRACK^ {#lb->setType(ARRAY_DECLARATOR);} RBRACK!)* + { + if ( addImagNode ) { + #builtInTypeSpec = #(#[TYPE,"TYPE"], #builtInTypeSpec); + } + } + ; + +// A type name, which is either a (possibly qualified) class name or +// a primitive (builtin) type +type + : identifier + | builtInType + ; + +// The primitive types. +builtInType + : "void" + | "boolean" + | "byte" + | "char" + | "short" + | "int" + | "float" + | "long" + | "double" + ; + +// A (possibly-qualified) java identifier. We start with the first IDENT +// and expand its name by adding dots and following IDENTS +identifier + : IDENT ( DOT^ IDENT )* + ; + +// A (possibly-qualified) identifier optionally ending in ".*"; this is +// the name form accepted by importDefinition +identifierStar + : IDENT + ( DOT^ IDENT )* + ( DOT^ STAR )? + ; + +// A list of zero or more modifiers. We could have used (modifier)* in +// place of a call to modifiers, but I thought it was a good idea to keep +// this rule separate so they can easily be collected in a Vector if +// someone so desires +modifiers + : ( modifier )* + {#modifiers = #([MODIFIERS, "MODIFIERS"], #modifiers);} + ; + +// modifiers for Java classes, interfaces, class/instance vars and methods +modifier + : "private" + | "public" + | "protected" + | "static" + | "transient" + | "final" + | "abstract" + | "native" + | "threadsafe" + | "synchronized" +// | "const" // reserved word, but not valid + | "volatile" + | "strictfp" + ; + +// Definition of a Java class +classDefinition![RefJavaAST modifiers] + : "class" IDENT + // it _might_ have a superclass... + sc:superClassClause + // it might implement some interfaces... + ic:implementsClause + // now parse the body of the class + cb:classBlock + {#classDefinition = #(#[CLASS_DEF,"CLASS_DEF"], + modifiers,IDENT,sc,ic,cb);} + ; + +superClassClause! + : ( "extends" id:identifier )? 
+ {#superClassClause = #(#[EXTENDS_CLAUSE,"EXTENDS_CLAUSE"],id);} + ; + +// Definition of a Java Interface +interfaceDefinition![RefJavaAST modifiers] + : "interface" IDENT + // it might extend some other interfaces + ie:interfaceExtends + // now parse the body of the interface (looks like a class...) + cb:classBlock + {#interfaceDefinition = #(#[INTERFACE_DEF,"INTERFACE_DEF"], + modifiers,IDENT,ie,cb);} + ; + +// This is the body of a class. You can have fields and extra semicolons, +// That's about it (until you see what a field is...) +classBlock + : LCURLY! + ( field | SEMI! )* + RCURLY! + {#classBlock = #([OBJBLOCK, "OBJBLOCK"], #classBlock);} + ; + +// An interface can extend several other interfaces... +interfaceExtends + : ( + e:"extends"! + identifier ( COMMA! identifier )* + )? + {#interfaceExtends = #(#[EXTENDS_CLAUSE,"EXTENDS_CLAUSE"], + #interfaceExtends);} + ; + +// A class can implement several interfaces... +implementsClause + : ( + i:"implements"! identifier ( COMMA! identifier )* + )? + {#implementsClause = #(#[IMPLEMENTS_CLAUSE,"IMPLEMENTS_CLAUSE"], + #implementsClause);} + ; + +// Now the various things that can be defined inside a class or interface... +// Note that not all of these are really valid in an interface (constructors, +// for example), and if this grammar were used for a compiler there would +// need to be some semantic checks to make sure we're doing the right thing... +field! + : // method, constructor, or variable declaration + mods:modifiers + ( h:ctorHead s:constructorBody // constructor + {#field = #(#[CTOR_DEF,"CTOR_DEF"], mods, h, s);} + + | cd:classDefinition[#mods] // inner class + {#field = #cd;} + + | id:interfaceDefinition[#mods] // inner interface + {#field = #id;} + + | t:typeSpec[false] // method or variable declaration(s) + ( IDENT // the name of the method + + // parse the formal parameter declarations. + LPAREN! param:parameterDeclarationList RPAREN! 
+ + rt:declaratorBrackets[#t] + + // get the list of exceptions that this method is + // declared to throw + (tc:throwsClause)? + + ( s2:compoundStatement | SEMI ) + {#field = #(#[METHOD_DEF,"METHOD_DEF"], + mods, + #(#[TYPE,"TYPE"],rt), + IDENT, + param, + tc, + s2);} + | v:variableDefinitions[#mods,#t] SEMI +// {#field = #(#[VARIABLE_DEF,"VARIABLE_DEF"], v);} + {#field = #v;} + ) + ) + + // "static { ... }" class initializer + | "static" s3:compoundStatement + {#field = #(#[STATIC_INIT,"STATIC_INIT"], s3);} + + // "{ ... }" instance initializer + | s4:compoundStatement + {#field = #(#[INSTANCE_INIT,"INSTANCE_INIT"], s4);} + ; + +// The body of a constructor: a braced statement list rooted at SLIST that +// may begin with one explicit this(...) or super(...) call. +constructorBody + : lc:LCURLY^ {#lc->setType(SLIST);} + ( options { greedy=true; } : explicitConstructorInvocation)? + (statement)* + RCURLY! + ; + +/** Catch obvious constructor calls, but not the expr.super(...) calls */ +explicitConstructorInvocation + : "this"! lp1:LPAREN^ argList RPAREN! SEMI! + {#lp1->setType(CTOR_CALL);} + | "super"! lp2:LPAREN^ argList RPAREN! SEMI! + {#lp2->setType(SUPER_CTOR_CALL);} + ; + +// A comma-separated list of variable declarators. Every declarator is +// handed its own dupTree() copy of the modifiers and type subtrees. +variableDefinitions[RefJavaAST mods, RefJavaAST t] + : variableDeclarator[(RefJavaAST)getASTFactory()->dupTree((antlr::RefAST)mods), + (RefJavaAST)getASTFactory()->dupTree((antlr::RefAST)t)] + ( COMMA! + variableDeclarator[(RefJavaAST)getASTFactory()->dupTree((antlr::RefAST)mods), + (RefJavaAST)getASTFactory()->dupTree((antlr::RefAST)t)] + )* + ; + +/** Declaration of a variable. This can be a class/instance variable, + * or a local variable in a method + * It can also include possible initialization. + */ +variableDeclarator![RefJavaAST mods, RefJavaAST t] + : id:IDENT d:declaratorBrackets[t] v:varInitializer + {#variableDeclarator = #(#[VARIABLE_DEF,"VARIABLE_DEF"], mods, #(#[TYPE,"TYPE"],d), id, v);} + ; + +// Zero or more "[]" pairs following a declared name. The result starts as +// the type tree passed in; each pair becomes an ARRAY_DECLARATOR root. +declaratorBrackets[RefJavaAST typ] + : {#declaratorBrackets=typ;} + (lb:LBRACK^ {#lb->setType(ARRAY_DECLARATOR);} RBRACK!)* + ; + +// The optional "= initializer" part of a variable declarator. +varInitializer + : ( ASSIGN^ initializer )? 
+ ; + +// This is an initializer used to set up an array. +arrayInitializer + : lc:LCURLY^ {#lc->setType(ARRAY_INIT);} + ( initializer + ( + // CONFLICT: does a COMMA after an initializer start a new + // initializer or start the optional ',' at end? + // ANTLR generates proper code by matching + // the comma as soon as possible. + options { + warnWhenFollowAmbig = false; + } + : + COMMA! initializer + )* + (COMMA!)? + )? + RCURLY! + ; + +// The two "things" that can initialize an array element are an expression +// and another (nested) array initializer. +initializer + : expression + | arrayInitializer + ; + +// This is the header of a constructor. It includes the name and parameters +// for the constructor. +// This also watches for a list of exception classes in a "throws" clause. +ctorHead + : IDENT // the name of the constructor + + // parse the formal parameter declarations. + LPAREN! parameterDeclarationList RPAREN! + + // get the list of exceptions that this constructor is declared to throw + (throwsClause)? + ; + +// This is a list of exception classes that the method is declared to throw +throwsClause + : "throws"^ identifier ( COMMA! identifier )* + ; + +// A list of formal parameters +parameterDeclarationList + : ( parameterDeclaration ( COMMA! parameterDeclaration )* )? + {#parameterDeclarationList = #(#[PARAMETERS,"PARAMETERS"], + #parameterDeclarationList);} + ; + +// A formal parameter. +parameterDeclaration! + : pm:parameterModifier t:typeSpec[false] id:IDENT + pd:declaratorBrackets[#t] + {#parameterDeclaration = #(#[PARAMETER_DEF,"PARAMETER_DEF"], + pm, #([TYPE,"TYPE"],pd), id);} + ; + +// The optional "final" on a formal parameter. A MODIFIERS node is always +// built, with or without a child. +parameterModifier + : (f:"final")? + {#parameterModifier = #(#[MODIFIERS,"MODIFIERS"], f);} + ; + +// Compound statement. 
This is used in many contexts: +// Inside a class definition prefixed with "static": +// it is a class initializer +// Inside a class definition without "static": +// it is an instance initializer +// As the body of a method +// As a completely independent braced block of code inside a method +// it starts a new scope for variable definitions + +compoundStatement + : lc:LCURLY^ {#lc->setType(SLIST);} + // include the (possibly-empty) list of statements + (statement)* + RCURLY! + ; + +statement + // A list of statements in curly braces -- start a new scope! + : compoundStatement + + // declarations are ambiguous with "ID DOT" relative to expression + // statements. Must backtrack to be sure. Could use a semantic + // predicate to test symbol table to see what the type was coming + // up, but that's pretty hard without a symbol table ;) + | (declaration)=> declaration SEMI! + + // An expression statement. This could be a method call, + // assignment statement, or any other expression evaluated for + // side-effects. + | expression SEMI! + + // class definition + | m:modifiers! classDefinition[#m] + + // Attach a label to the front of a statement + | IDENT c:COLON^ {#c->setType(LABELED_STAT);} statement + + // If-else statement + | "if"^ LPAREN! expression RPAREN! statement + ( + // CONFLICT: the old "dangling-else" problem... + // ANTLR generates proper code matching + // as soon as possible. Hush warning. + options { + warnWhenFollowAmbig = false; + } + : + "else"! statement + )? + + // For statement + | "for"^ + LPAREN! + forInit SEMI! // initializer + forCond SEMI! // condition test + forIter // updater + RPAREN! + statement // statement to loop over + + // While statement + | "while"^ LPAREN! expression RPAREN! statement + + // do-while statement + | "do"^ statement "while"! LPAREN! expression RPAREN! SEMI! + + // get out of a loop (or switch) + | "break"^ (IDENT)? SEMI! + + // do next iteration of a loop + | "continue"^ (IDENT)? SEMI! 
+ + // Return an expression + | "return"^ (expression)? SEMI! + + // switch/case statement + | "switch"^ LPAREN! expression RPAREN! LCURLY! + ( casesGroup )* + RCURLY! + + // exception try-catch block + | tryBlock + + // throw an exception + | "throw"^ expression SEMI! + + // synchronize a statement + | "synchronized"^ LPAREN! expression RPAREN! compoundStatement + + // asserts (uncomment if you want 1.4 compatibility) + // | "assert"^ expression ( COLON! expression )? SEMI! + + // empty statement + | s:SEMI {#s->setType(EMPTY_STAT);} + ; + +casesGroup + : ( // CONFLICT: to which case group do the statements bind? + // ANTLR generates proper code: it groups the + // many "case"/"default" labels together then + // follows them with the statements + options { + greedy = true; + } + : + aCase + )+ + caseSList + {#casesGroup = #([CASE_GROUP, "CASE_GROUP"], #casesGroup);} + ; + +aCase + : ("case"^ expression | "default") COLON! + ; + +caseSList + : (statement)* + {#caseSList = #(#[SLIST,"SLIST"],#caseSList);} + ; + +// The initializer for a for loop +forInit + // if it looks like a declaration, it is + : ( (declaration)=> declaration + // otherwise it could be an expression list... + | expressionList + )? + {#forInit = #(#[FOR_INIT,"FOR_INIT"],#forInit);} + ; + +forCond + : (expression)? + {#forCond = #(#[FOR_CONDITION,"FOR_CONDITION"],#forCond);} + ; + +forIter + : (expressionList)? + {#forIter = #(#[FOR_ITERATOR,"FOR_ITERATOR"],#forIter);} + ; + +// an exception handler try/catch block +tryBlock + : "try"^ compoundStatement + (handler)* + ( finallyClause )? + ; + +finallyClause + : "finally"^ compoundStatement + ; + +// an exception handler +handler + : "catch"^ LPAREN! parameterDeclaration RPAREN! 
compoundStatement + ; + +// expressions +// Note that most of these expressions follow the pattern +// thisLevelExpression : +// nextHigherPrecedenceExpression +// (OPERATOR nextHigherPrecedenceExpression)* +// which is a standard recursive definition for parsing an expression. +// The operators in java have the following precedences: +// lowest (13) = *= /= %= += -= <<= >>= >>>= &= ^= |= +// (12) ?: +// (11) || +// (10) && +// ( 9) | +// ( 8) ^ +// ( 7) & +// ( 6) == != +// ( 5) < <= > >= instanceof +// ( 4) << >> >>> +// ( 3) +(binary) -(binary) +// ( 2) * / % +// ( 1) ++ -- +(unary) -(unary) ~ ! (type) +// [] () (method call) . (dot -- identifier qualification) +// new () (explicit parenthesis) +// +// the last two are not usually on a precedence chart; I put them in +// to point out that new has a higher precedence than '.', so you +// can validly use +// new Frame().show() +// +// Note that the above precedence levels map to the rules below... +// Once you have a precedence chart, writing the appropriate rules as below +// is usually very straightforward + +// the mother of all expressions +expression + : assignmentExpression + {#expression = #(#[EXPR,"EXPR"],#expression);} + ; + +// This is a list of expressions. +expressionList + : expression (COMMA! expression)* + {#expressionList = #(#[ELIST,"ELIST"], expressionList);} + ; + +// assignment expression (level 13) +assignmentExpression + : conditionalExpression + ( ( ASSIGN^ + | PLUS_ASSIGN^ + | MINUS_ASSIGN^ + | STAR_ASSIGN^ + | DIV_ASSIGN^ + | MOD_ASSIGN^ + | SR_ASSIGN^ + | BSR_ASSIGN^ + | SL_ASSIGN^ + | BAND_ASSIGN^ + | BXOR_ASSIGN^ + | BOR_ASSIGN^ + ) + assignmentExpression + )? + ; + +// conditional test (level 12) +conditionalExpression + : logicalOrExpression + ( QUESTION^ assignmentExpression COLON! conditionalExpression )? 
+ ; + +// logical or (||) (level 11) +logicalOrExpression + : logicalAndExpression (LOR^ logicalAndExpression)* + ; + +// logical and (&&) (level 10) +logicalAndExpression + : inclusiveOrExpression (LAND^ inclusiveOrExpression)* + ; + +// bitwise or non-short-circuiting or (|) (level 9) +inclusiveOrExpression + : exclusiveOrExpression (BOR^ exclusiveOrExpression)* + ; + +// exclusive or (^) (level 8) +exclusiveOrExpression + : andExpression (BXOR^ andExpression)* + ; + +// bitwise or non-short-circuiting and (&) (level 7) +andExpression + : equalityExpression (BAND^ equalityExpression)* + ; + +// equality/inequality (==/!=) (level 6) +equalityExpression + : relationalExpression ((NOT_EQUAL^ | EQUAL^) relationalExpression)* + ; + +// boolean relational expressions (level 5) +relationalExpression + : shiftExpression + ( ( ( LT_^ + | GT^ + | LE^ + | GE^ + ) + shiftExpression + )* + | "instanceof"^ typeSpec[true] + ) + ; + +// bit shift expressions (level 4) +shiftExpression + : additiveExpression ((SL^ | SR^ | BSR^) additiveExpression)* + ; + +// binary addition/subtraction (level 3) +additiveExpression + : multiplicativeExpression ((PLUS^ | MINUS^) multiplicativeExpression)* + ; + +// multiplication/division/modulo (level 2) +multiplicativeExpression + : unaryExpression ((STAR^ | DIV^ | MOD^ ) unaryExpression)* + ; + +unaryExpression + : INC^ unaryExpression + | DEC^ unaryExpression + | MINUS^ {#MINUS->setType(UNARY_MINUS);} unaryExpression + | PLUS^ {#PLUS->setType(UNARY_PLUS);} unaryExpression + | unaryExpressionNotPlusMinus + ; + +unaryExpressionNotPlusMinus + : BNOT^ unaryExpression + | LNOT^ unaryExpression + + | ( // subrule allows option to shut off warnings + options { + // "(int" ambig with postfixExpr due to lack of sequence + // info in linear approximate LL(k). It's ok. Shut up. + generateAmbigWarnings=false; + } + : // If typecast is built in type, must be numeric operand + // Also, no reason to backtrack if type keyword like int, float... 
+ lpb:LPAREN^ {#lpb->setType(TYPECAST);} builtInTypeSpec[true] RPAREN! + unaryExpression + + // Have to backtrack to see if operator follows. If no operator + // follows, it's a typecast. No semantic checking needed to parse. + // if it _looks_ like a cast, it _is_ a cast; else it's a "(expr)" + | (LPAREN classTypeSpec[true] RPAREN unaryExpressionNotPlusMinus)=> + lp:LPAREN^ {#lp->setType(TYPECAST);} classTypeSpec[true] RPAREN! + unaryExpressionNotPlusMinus + + | postfixExpression + ) + ; + +// qualified names, array expressions, method invocation, post inc/dec +postfixExpression + : + /* + "this"! lp1:LPAREN^ argList RPAREN! + {#lp1->setType(CTOR_CALL);} + + | "super"! lp2:LPAREN^ argList RPAREN! + {#lp2->setType(SUPER_CTOR_CALL);} + | + */ + primaryExpression + + ( + /* + options { + // the use of postfixExpression in SUPER_CTOR_CALL adds DOT + // to the lookahead set, and gives loads of false non-det + // warnings. + // shut them off. + generateAmbigWarnings=false; + } + : */ + DOT^ IDENT + ( lp:LPAREN^ {#lp->setType(METHOD_CALL);} + argList + RPAREN! + )? + | DOT^ "this" + + | DOT^ "super" + ( // (new Outer()).super() (create enclosing instance) + lp3:LPAREN^ argList RPAREN! + {#lp3->setType(SUPER_CTOR_CALL);} + | DOT^ IDENT + ( lps:LPAREN^ {#lps->setType(METHOD_CALL);} + argList + RPAREN! + )? + ) + | DOT^ newExpression + | lb:LBRACK^ {#lb->setType(INDEX_OP);} expression RBRACK! + )* + + ( // possibly add on a post-increment or post-decrement. + // allows INC/DEC on too much, but semantics can check + in:INC^ {#in->setType(POST_INC);} + | de:DEC^ {#de->setType(POST_DEC);} + )? + ; + +// the basic element of an expression +primaryExpression + : identPrimary ( options {greedy=true;} : DOT^ "class" )? + | constant + | "true" + | "false" + | "null" + | newExpression + | "this" + | "super" + | LPAREN! assignmentExpression RPAREN! + // look for int.class and int[].class + | builtInType + ( lbt:LBRACK^ {#lbt->setType(ARRAY_DECLARATOR);} RBRACK! 
)* + DOT^ "class" + ; + +/** Match a, a.b.c refs, a.b.c(...) refs, a.b.c[], a.b.c[].class, + * and a.b.c.class refs. Also this(...) and super(...). Match + * this or super. + */ +identPrimary + : IDENT + ( + options { + // .ident could match here or in postfixExpression. + // We do want to match here. Turn off warning. + greedy=true; + } + : DOT^ IDENT + )* + ( + options { + // ARRAY_DECLARATOR here conflicts with INDEX_OP in + // postfixExpression on LBRACK RBRACK. + // We want to match [] here, so greedy. This overcomes + // limitation of linear approximate lookahead. + greedy=true; + } + : ( lp:LPAREN^ {#lp->setType(METHOD_CALL);} argList RPAREN! ) + | ( options {greedy=true;} : + lbc:LBRACK^ {#lbc->setType(ARRAY_DECLARATOR);} RBRACK! + )+ + )? + ; + +/** object instantiation. + * Trees are built as illustrated by the following input/tree pairs: + * + * new T() + * + * new + * | + * T -- ELIST + * | + * arg1 -- arg2 -- .. -- argn + * + * new int[] + * + * new + * | + * int -- ARRAY_DECLARATOR + * + * new int[] {1,2} + * + * new + * | + * int -- ARRAY_DECLARATOR -- ARRAY_INIT + * | + * EXPR -- EXPR + * | | + * 1 2 + * + * new int[3] + * new + * | + * int -- ARRAY_DECLARATOR + * | + * EXPR + * | + * 3 + * + * new int[1][2] + * + * new + * | + * int -- ARRAY_DECLARATOR + * | + * ARRAY_DECLARATOR -- EXPR + * | | + * EXPR 1 + * | + * 2 + * + */ +newExpression + : "new"^ type + ( LPAREN! argList RPAREN! (classBlock)? + + //java 1.1 + // Note: This will allow bad constructs like + // new int[4][][3] {exp,exp}. + // There needs to be a semantic check here... + // to make sure: + // a) [ expr ] and [ ] are not mixed + // b) [ expr ] and an init are not used together + + | newArrayDeclarator (arrayInitializer)? + ) + ; + +argList + : ( expressionList + | /*nothing*/ + {#argList = #[ELIST,"ELIST"];} + ) + ; + +newArrayDeclarator + : ( + // CONFLICT: + // newExpression is a primaryExpression which can be + // followed by an array index reference. 
This is ok, + // as the generated code will stay in this loop as + // long as it sees an LBRACK (proper behavior) + options { + warnWhenFollowAmbig = false; + } + : + lb:LBRACK^ {#lb->setType(ARRAY_DECLARATOR);} + (expression)? + RBRACK! + )+ + ; + +// The literal token kinds that can appear as a primary expression. +constant + : NUM_INT + | CHAR_LITERAL + | STRING_LITERAL + | NUM_FLOAT + | NUM_LONG + | NUM_DOUBLE + ; + +//---------------------------------------------------------------------------- +// The Java scanner +//---------------------------------------------------------------------------- +{ +#include <string> +} +class JavaLexer extends Lexer; + +options { + exportVocab=Java; // call the vocabulary "Java" + testLiterals=false; // don't automatically test for literals + defaultErrorHandler=false; + k=4; // four characters of lookahead +// charVocabulary='\u0003'..'\uFFFF'; + charVocabulary='\u0003'..'\u00FF'; + // without inlining some bitset tests, couldn't do unicode; + // I need to make ANTLR generate smaller bitsets; see + // bottom of JavaLexer.java + codeGenBitsetTestThreshold=20; +} +// Members added to the generated lexer class: they route every lexer +// diagnostic to a Driver as a Problem. +{ +private: + // Receiver of all reported problems. It has no initializer here and is + // dereferenced unconditionally below, so setDriver() must be called + // before setFileName() or any report*() method runs. + Driver* m_driver; + +public: + // Attach the driver that will receive problems. + void setDriver( Driver* d ) { m_driver = d; } + // Assign fileName to the driver's current file name. + void setFileName( const QString& fileName ) { m_driver->currentFileName() = fileName; } + + // Report a recognition error at the line/column carried by the exception. + virtual void reportError( const ANTLR_USE_NAMESPACE(antlr)RecognitionException& ex ){ + m_driver->addProblem( m_driver->currentFileName(), Problem( QString::fromLocal8Bit(ex.getMessage().c_str()), ex.getLine(), ex.getColumn()) ); + } + + // Report a plain error message at the lexer's current getLine()/getColumn(). + virtual void reportError( const ANTLR_USE_NAMESPACE(std)string& errorMessage ){ + m_driver->addProblem( m_driver->currentFileName(), Problem( QString::fromLocal8Bit(errorMessage.c_str()), getLine(), getColumn()) ); + } + + // Report a warning at the lexer's current position; it is recorded the + // same way as an error. + virtual void reportWarning( const ANTLR_USE_NAMESPACE(std)string& warnMessage ){ + m_driver->addProblem( m_driver->currentFileName(), Problem( QString::fromLocal8Bit(warnMessage.c_str()), getLine(), getColumn()) ); + } +} + +// OPERATORS +QUESTION : '?' 
; +LPAREN : '(' ; +RPAREN : ')' ; +LBRACK : '[' ; +RBRACK : ']' ; +LCURLY : '{' ; +RCURLY : '}' ; +COLON : ':' ; +COMMA : ',' ; +//DOT : '.' ; +ASSIGN : '=' ; +EQUAL : "==" ; +LNOT : '!' ; +BNOT : '~' ; +NOT_EQUAL : "!=" ; +DIV : '/' ; +DIV_ASSIGN : "/=" ; +PLUS : '+' ; +PLUS_ASSIGN : "+=" ; +INC : "++" ; +MINUS : '-' ; +MINUS_ASSIGN : "-=" ; +DEC : "--" ; +STAR : '*' ; +STAR_ASSIGN : "*=" ; +MOD : '%' ; +MOD_ASSIGN : "%=" ; +SR : ">>" ; +SR_ASSIGN : ">>=" ; +BSR : ">>>" ; +BSR_ASSIGN : ">>>=" ; +GE : ">=" ; +GT : ">" ; +SL : "<<" ; +SL_ASSIGN : "<<=" ; +LE : "<=" ; +LT_ : '<' ; +BXOR : '^' ; +BXOR_ASSIGN : "^=" ; +BOR : '|' ; +BOR_ASSIGN : "|=" ; +LOR : "||" ; +BAND : '&' ; +BAND_ASSIGN : "&=" ; +LAND : "&&" ; +SEMI : ';' ; + +// Whitespace -- ignored +WS : ( ' ' + | '\t' + | '\f' + // handle newlines + | ( options {generateAmbigWarnings=false;} + : "\r\n" // Evil DOS + | '\r' // Macintosh + | '\n' // Unix (the right way) + ) + { newline(); } + )+ + { $setType(ANTLR_USE_NAMESPACE(antlr)Token::SKIP); } + ; + +// Single-line comments +// The terminating newline is deliberately not consumed here: the WS rule +// matches it and calls newline(), so line counting is unchanged, and a +// comment that ends at end-of-file without a trailing newline is still +// recognized instead of producing a lexer error. +SL_COMMENT + : "//" + (~('\n'|'\r'))* + { + $setType(ANTLR_USE_NAMESPACE(antlr)Token::SKIP); + } + ; + +// multiple-line comments +ML_COMMENT + : "/*" + ( /* '\r' '\n' can be matched in one alternative or by matching + '\r' in one iteration and '\n' in another. I am trying to + handle any flavor of newline that comes in, but the language + that allows both "\r\n" and "\r" and "\n" to all be valid + newline is ambiguous. Consequently, the resulting grammar + must be ambiguous. I'm shutting this warning off. + */ + options { + generateAmbigWarnings=false; + } + : + { LA(2)!='/' }? 
'*' + | '\r' '\n' {newline();} + | '\r' {newline();} + | '\n' {newline();} + | ~('*'|'\n'|'\r') + )* + "*/" + {$setType(ANTLR_USE_NAMESPACE(antlr)Token::SKIP);} + ; + +// character literals +// A literal holds exactly one ESC or one ordinary character. A raw quote, +// backslash or line terminator is not an ordinary character, so a +// backslash can only be matched through ESC. +CHAR_LITERAL + : '\'' ( ESC | ~('\''|'\n'|'\r'|'\\') ) '\'' + ; + +// string literals +// Line terminators are excluded: an unterminated string now fails at the +// end of its own line instead of swallowing the following lines, which +// would also bypass newline() and skew every later reported line number. +STRING_LITERAL + : '"' (ESC|~('"'|'\\'|'\n'|'\r'))* '"' + ; + +// escape sequence -- note that this is protected; it can only be called +// from another lexer rule -- it will not ever directly return a token to +// the parser +// There are various ambiguities hushed in this rule. The optional +// '0'..'7' digit matches should be matched here rather than letting +// them go back to STRING_LITERAL to be matched. ANTLR does the +// right thing by matching immediately; hence, it's ok to shut off +// the FOLLOW ambig warnings. +protected +ESC + : '\\' + ( 'n' + | 'r' + | 't' + | 'b' + | 'f' + | '"' + | '\'' + | '\\' + | ('u')+ HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT + | '0'..'3' + ( + options { + warnWhenFollowAmbig = false; + } + : '0'..'7' + ( + options { + warnWhenFollowAmbig = false; + } + : '0'..'7' + )? + )? + | '4'..'7' + ( + options { + warnWhenFollowAmbig = false; + } + : '0'..'7' + )? + ) + ; + +// hexadecimal digit (again, note it's protected!) +protected +HEX_DIGIT + : ('0'..'9'|'A'..'F'|'a'..'f') + ; + +// a dummy rule to force vocabulary to be all characters (except special +// ones that ANTLR uses internally (0 to 2) +protected +VOCAB + : '\3'..'\377' + ; + +// an identifier. Note that testLiterals is set to true! This means +// that after we match the rule, we look in the literals table to see +// if it's a literal or really an identifier +IDENT + options {testLiterals=true;} + : ('a'..'z'|'A'..'Z'|'_'|'$') ('a'..'z'|'A'..'Z'|'_'|'0'..'9'|'$')* + ; + +// a numeric literal +NUM_INT + { + bool isDecimal = false; + ANTLR_USE_NAMESPACE(antlr)RefToken t = ANTLR_USE_NAMESPACE(antlr)nullToken; + } + : '.' {_ttype = DOT;} + ( ('0'..'9')+ (EXPONENT)? (f1:FLOAT_SUFFIX {t=f1;})? 
+ { + if ( t && + (t->getText().find('f') != ANTLR_USE_NAMESPACE(std)string::npos || + t->getText().find('F') != ANTLR_USE_NAMESPACE(std)string::npos ) ) { + _ttype = NUM_FLOAT; + } + else { + _ttype = NUM_DOUBLE; // assume double + } + } + )? + + | ( '0' {isDecimal = true;} // special case for just '0' + ( ('x'|'X') + ( // hex + // the 'e'|'E' and float suffix stuff look + // like hex digits, hence the (...)+ doesn't + // know when to stop: ambig. ANTLR resolves + // it correctly by matching immediately. It + // is therefore ok to hush warning. + options { + warnWhenFollowAmbig=false; + } + : HEX_DIGIT + )+ + | ('0'..'7')+ // octal + )? + | ('1'..'9') ('0'..'9')* {isDecimal=true;} // non-zero decimal + ) + ( ('l'|'L') { _ttype = NUM_LONG; } + + // only check to see if it's a float if looks like decimal so far + // NOTE(review): isDecimal was already set on the leading '0' above, + // so this branch is also reachable after a hex or octal prefix -- + // confirm that is intended. + | {isDecimal}? + ( '.' ('0'..'9')* (EXPONENT)? (f2:FLOAT_SUFFIX {t=f2;})? + | EXPONENT (f3:FLOAT_SUFFIX {t=f3;})? + | f4:FLOAT_SUFFIX {t=f4;} + ) + { + if ( t && + (t->getText().find('f') != ANTLR_USE_NAMESPACE(std)string::npos || + t->getText().find('F') != ANTLR_USE_NAMESPACE(std)string::npos ) ) { + _ttype = NUM_FLOAT; + } + else { + _ttype = NUM_DOUBLE; // assume double + } + } + )? + ; + +// a couple protected methods to assist in matching floating point numbers +protected +EXPONENT + : ('e'|'E') ('+'|'-')? ('0'..'9')+ + ; + +protected +FLOAT_SUFFIX + : 'f'|'F'|'d'|'D' + ; + |