View Javadoc
1   /*
2    * $Header$
3    * $Revision$
4    * $Date$
5    *
6    * ====================================================================
7    *
8    * Copyright 2000-2002 bob mcwhirter & James Strachan.
9    * All rights reserved.
10   *
11   *
12   * Redistribution and use in source and binary forms, with or without
13   * modification, are permitted provided that the following conditions are
14   * met:
15   * 
16   *   * Redistributions of source code must retain the above copyright
17   *     notice, this list of conditions and the following disclaimer.
18   * 
19   *   * Redistributions in binary form must reproduce the above copyright
20   *     notice, this list of conditions and the following disclaimer in the
21   *     documentation and/or other materials provided with the distribution.
22   * 
23   *   * Neither the name of the Jaxen Project nor the names of its
24   *     contributors may be used to endorse or promote products derived 
25   *     from this software without specific prior written permission.
26   * 
27   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
28   * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29   * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
30   * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
31   * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32   * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33   * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34   * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35   * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36   * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38   *
39   * ====================================================================
40   * This software consists of voluntary contributions made by many
41   * individuals on behalf of the Jaxen Project and was originally
42   * created by bob mcwhirter <bob@werken.com> and
43   * James Strachan <jstrachan@apache.org>.  For more information on the
44   * Jaxen Project, please see <http://www.jaxen.org/>.
45   *
46   * $Id$
47   */
48  
49  package org.jaxen.saxpath.base;
50  
51  class XPathLexer
52  {
53      private String xpath;
54      private int    currentPosition;
55      private int    endPosition;
56      private boolean expectOperator = false;
57  
58      XPathLexer(String xpath)
59      {
60          setXPath( xpath );
61      }
62  
63      private void setXPath(String xpath)
64      {
65          this.xpath           = xpath;
66          this.currentPosition = 0;
67          this.endPosition     = xpath.length();
68      }
69  
70      String getXPath()
71      {
72          return this.xpath;
73      }
74  
75      Token nextToken()
76      {
77          Token token = null;
78  
79          do
80          {
81              token = null;
82  
83              switch ( LA(1) )
84              {
85                  case '$':
86                  {
87                      token = dollar();
88                      break;
89                  }
90                      
91                  case '"':
92                  case '\'':
93                  {
94                      token = literal();
95                      break;
96                  }
97                      
98                  case '/':
99                  {
100                     token = slashes();
101                     break;
102                 }
103 
104                 case ',':
105                 {
106                     token = comma();
107                     break;
108                 }
109                     
110                 case '(':
111                 {
112                     token = leftParen();
113                     break;
114                 }
115                     
116                 case ')':
117                 {
118                     token = rightParen();
119                     break;
120                 }
121                     
122                 case '[':
123                 {
124                     token = leftBracket();
125                     break;
126                 }
127                     
128                 case ']':
129                 {
130                     token = rightBracket();
131                     break;
132                 }
133                     
134                 case '+':
135                 {
136                     token = plus();
137                     break;
138                 }
139                     
140                 case '-':
141                 {
142                     token = minus();
143                     break;
144                 }
145                     
146                 case '<':
147                 case '>':
148                 {
149                     token = relationalOperator();
150                     break;
151                 }        
152 
153                 case '=':
154                 {
155                     token = equals();
156                     break;
157                 }
158                     
159                 case '!':
160                 {
161                     if ( LA(2) == '=' )
162                     {
163                         token = notEquals();
164                     }
165                     break;
166                 }
167                     
168                 case '|':
169                 {
170                     token = pipe();
171                     break;
172                 }
173                     
174                 case '@':
175                 {
176                     token = at();
177                     break;
178                 }
179                     
180                 case ':':
181                 {
182                     if ( LA(2) == ':' )
183                     {
184                         token = doubleColon();
185                     }
186                     else
187                     {
188                         token = colon();
189                     }
190                     break;
191                 }
192                     
193                 case '*':
194                 {
195                     token = star();
196                     break;
197                 }
198                     
199                 case '.':
200                 {
201                     switch ( LA(2) )
202                     {
203                         case '0':
204                         case '1':
205                         case '2':
206                         case '3':
207                         case '4':
208                         case '5':
209                         case '6':
210                         case '7':
211                         case '8':
212                         case '9':
213                         {
214                             token = number();
215                             break;
216                         }
217                         default:
218                         {
219                             token = dots();
220                             break;
221                         }
222                     }
223                     break;
224                 }
225 
226                 case '0':
227                 case '1':
228                 case '2':
229                 case '3':
230                 case '4':
231                 case '5':
232                 case '6':
233                 case '7':
234                 case '8':
235                 case '9':
236                 {
237                     token = number();
238                     break;
239                 }
240 
241                 case ' ':
242                 case '\t':
243                 case '\n':
244                 case '\r':
245                 {
246                     token = whitespace();
247                     break;
248                 }
249                     
250                 default:
251                 {
252                     if ( Verifier.isXMLNCNameStartCharacter( LA(1) ) )
253                     {
254                         token = identifierOrOperatorName();
255                     }
256                 }
257             }
258 
259             if ( token == null )
260             {
261                 if (!hasMoreChars())
262                 {
263                     token = new Token( TokenTypes.EOF,
264                                    getXPath(),
265                                    this.currentPosition,
266                                    this.endPosition );
267             }
268                 else
269                 {
270                     token = new Token( TokenTypes.ERROR,
271                                    getXPath(),
272                                    this.currentPosition,
273                                    this.endPosition );
274                 }
275             }
276 
277         }
278         while (token.getTokenType() == TokenTypes.SKIP );
279         
280         // For some reason, section 3.7, Lexical structure,
281         // doesn't seem to feel like it needs to mention the
282         // SLASH, DOUBLE_SLASH, and COLON tokens for the test
283         // if an NCName is an operator or not.
284         //
285         // According to section 3.7, "/foo" should be considered
286         // as a SLASH following by an OperatorName being 'foo'.
287         // Which is just simply, clearly, wrong, in my mind.
288         //
289         //     -bob
290 
291         switch ( token.getTokenType() )
292         {
293             case TokenTypes.AT:
294             case TokenTypes.DOUBLE_COLON:
295             case TokenTypes.LEFT_PAREN:
296             case TokenTypes.LEFT_BRACKET:
297             case TokenTypes.AND:
298             case TokenTypes.OR:
299             case TokenTypes.MOD:
300             case TokenTypes.DIV:
301             case TokenTypes.COLON:
302             case TokenTypes.SLASH:
303             case TokenTypes.DOUBLE_SLASH:
304             case TokenTypes.PIPE:
305             case TokenTypes.DOLLAR:
306             case TokenTypes.PLUS:
307             case TokenTypes.MINUS:
308             case TokenTypes.STAR_OPERATOR:
309             case TokenTypes.COMMA:
310             case TokenTypes.LESS_THAN_SIGN:
311             case TokenTypes.GREATER_THAN_SIGN:
312             case TokenTypes.LESS_THAN_OR_EQUALS_SIGN:
313             case TokenTypes.GREATER_THAN_OR_EQUALS_SIGN:
314             case TokenTypes.EQUALS:
315             case TokenTypes.NOT_EQUALS:
316             {
317                 expectOperator = false;
318                 break;
319             }
320             default:
321             {
322                 expectOperator = true;
323                 break;
324             }
325         }
326 
327          return token;
328      }
329 
330     private Token identifierOrOperatorName()
331     {
332         Token token = null;
333         if ( expectOperator ) {
334             token = operatorName();
335         } else {
336             token = identifier();
337         }
338         return token;
339     }
340     
341     private Token identifier()
342     {
343         Token token = null;
344     
345         int start = this.currentPosition;
346     
347         while ( hasMoreChars() )
348         {
349             if ( Verifier.isXMLNCNameCharacter( LA(1) ) )
350             {
351                 consume();
352             }
353             else
354             {
355                 break;
356             }
357         }
358     
359         token = new Token( TokenTypes.IDENTIFIER,
360                            getXPath(),
361                            start,
362                            this.currentPosition );
363     
364         return token;
365     }
366     
367     private Token operatorName()
368     {
369         Token token = null;
370     
371         switch ( LA(1) )
372         {
373             case 'a':
374             {
375                 token = and();
376                 break;
377             }
378     
379             case 'o':
380             {
381                 token = or();
382                 break;
383             }
384     
385             case 'm':
386             {
387                 token = mod();
388                 break;
389             }
390     
391             case 'd':
392             {
393                 token = div();
394                 break;
395             }
396         }
397     
398         return token;
399     }
400     
401     private Token mod()
402     {
403         Token token = null;
404     
405         if ( ( LA(1) == 'm' )
406              &&
407              ( LA(2) == 'o' )
408              &&
409              ( LA(3) == 'd' )
410            )
411         {
412             token = new Token( TokenTypes.MOD,
413                                getXPath(),
414                                this.currentPosition,
415                                this.currentPosition+3 );
416     
417             consume();
418             consume();
419             consume();
420         }
421     
422         return token;
423     }
424     
425     private Token div()
426     {
427         Token token = null;
428     
429         if ( ( LA(1) == 'd' )
430              &&
431              ( LA(2) == 'i' )
432              &&
433              ( LA(3) == 'v' )
434             )
435         {
436             token = new Token( TokenTypes.DIV,
437                                getXPath(),
438                                this.currentPosition,
439                                this.currentPosition+3 );
440     
441             consume();
442             consume();
443             consume();
444         }
445     
446         return token;
447     }
448     
449     private Token and()
450     {
451         Token token = null;
452     
453         if ( ( LA(1) == 'a' )
454              &&
455              ( LA(2) == 'n' )
456              &&
457              ( LA(3) == 'd' )
458            )
459         {
460             token = new Token( TokenTypes.AND,
461                                getXPath(),
462                                this.currentPosition,
463                                this.currentPosition+3 );
464     
465             consume();
466             consume();
467             consume();
468         }
469     
470         return token;
471     }
472     
473     private Token or()
474     {
475         Token token = null;
476     
477         if ( ( LA(1) == 'o' )
478              &&
479              ( LA(2) == 'r' )
480            )
481         {
482             token = new Token( TokenTypes.OR,
483                                getXPath(),
484                                this.currentPosition,
485                                this.currentPosition+2 );
486     
487             consume();
488             consume();
489         }
490     
491         return token;
492     }
493     
494     private Token number()
495     {
496         int     start         = this.currentPosition;
497         boolean periodAllowed = true;
498     
499       loop:
500         while( true )
501         {
502             switch ( LA(1) )
503             {
504                 case '.':
505                     if ( periodAllowed )
506                     {
507                         periodAllowed = false;
508                         consume();
509                     }
510                     else
511                     {
512                         break loop;
513                     }
514                     break;
515                 case '0':
516                 case '1':
517                 case '2':
518                 case '3':
519                 case '4':
520                 case '5':
521                 case '6':
522                 case '7':
523                 case '8':
524                 case '9':
525                     consume();
526                     break;
527                 default:
528                     break loop;
529             }
530         }
531     
532         return new Token( TokenTypes.DOUBLE,
533                                getXPath(),
534                                start,
535                                this.currentPosition );
536     }
537     
538     private Token whitespace()
539     {
540         consume();
541             
542       loop:
543         while( hasMoreChars() )
544         {
545             switch ( LA(1) )
546             {
547                 case ' ':
548                 case '\t':
549                 case '\n':
550                 case '\r':
551                 {
552                     consume();
553                     break;
554                 }
555                     
556                 default:
557                 {
558                     break loop;
559                 }
560             }
561         }
562     
563         return new Token( TokenTypes.SKIP,
564                           getXPath(),
565                           0,
566                           0 );
567     }
568     
569     private Token comma()
570     {
571         Token token = new Token( TokenTypes.COMMA,
572                                  getXPath(),
573                                  this.currentPosition,
574                                  this.currentPosition+1 );
575     
576         consume();
577     
578         return token;
579     }
580     
581     private Token equals()
582     {
583         Token token = new Token( TokenTypes.EQUALS,
584                                  getXPath(),
585                                  this.currentPosition,
586                                  this.currentPosition+1 );
587     
588         consume();
589     
590         return token;
591     }
592     
593     private Token minus()
594     {
595         Token token = new Token( TokenTypes.MINUS,
596                                  getXPath(),
597                                  this.currentPosition,
598                                  this.currentPosition+1 );
599         consume();
600             
601         return token;
602     }
603     
604     private Token plus()
605     {
606         Token token = new Token( TokenTypes.PLUS,
607                                  getXPath(),
608                                  this.currentPosition,
609                                  this.currentPosition+1 );
610         consume();
611     
612         return token;
613     }
614     
615     private Token dollar()
616     {
617         Token token = new Token( TokenTypes.DOLLAR,
618                                  getXPath(),
619                                  this.currentPosition,
620                                  this.currentPosition+1 );
621         consume();
622     
623         return token;
624     }
625     
626     private Token pipe()
627     {
628         Token token = new Token( TokenTypes.PIPE,
629                                  getXPath(),
630                                  this.currentPosition,
631                                  this.currentPosition+1 );
632     
633         consume();
634     
635         return token;
636     }
637     
638     private Token at()
639     {
640         Token token = new Token( TokenTypes.AT,
641                                  getXPath(),
642                                  this.currentPosition,
643                                  this.currentPosition+1 );
644     
645         consume();
646     
647         return token;
648     }
649     
650     private Token colon()
651     {
652         Token token = new Token( TokenTypes.COLON,
653                                  getXPath(),
654                                  this.currentPosition,
655                                  this.currentPosition+1 );
656         consume();
657     
658         return token;
659     }
660     
661     private Token doubleColon()
662     {
663         Token token = new Token( TokenTypes.DOUBLE_COLON,
664                                  getXPath(),
665                                  this.currentPosition,
666                                  this.currentPosition+2 );
667     
668         consume();
669         consume();
670     
671         return token;
672     }
673     
674     private Token notEquals()
675     {
676         Token token = new Token( TokenTypes.NOT_EQUALS,
677                                  getXPath(),
678                                  this.currentPosition,
679                                  this.currentPosition + 2 );
680     
681         consume();
682         consume();
683     
684         return token;
685     }
686     
687     private Token relationalOperator()
688     {
689         Token token = null;
690     
691         switch ( LA(1) )
692         {
693             case '<':
694             {
695                 if ( LA(2) == '=' )
696                 {
697                     token = new Token( TokenTypes.LESS_THAN_OR_EQUALS_SIGN,
698                                        getXPath(),
699                                        this.currentPosition,
700                                        this.currentPosition + 2 );
701                     consume();
702                 }
703                 else
704                 {
705                     token = new Token( TokenTypes.LESS_THAN_SIGN,
706                                        getXPath(),
707                                        this.currentPosition,
708                                        this.currentPosition + 1);
709                 }
710     
711                 consume();
712                 break;
713             }
714             case '>':
715             {
716                 if ( LA(2) == '=' )
717                 {
718                     token = new Token( TokenTypes.GREATER_THAN_OR_EQUALS_SIGN,
719                                        getXPath(),
720                                        this.currentPosition,
721                                        this.currentPosition + 2 );
722                     consume();
723                 }
724                 else
725                 {
726                     token = new Token( TokenTypes.GREATER_THAN_SIGN,
727                                        getXPath(),
728                                        this.currentPosition,
729                                        this.currentPosition + 1 );
730                 }
731     
732                 consume();
733                 break;
734             }
735         }
736     
737         return token;
738                 
739     }
740     
741     // ????
742     private Token star()
743     {
744         int tokenType = expectOperator ? TokenTypes.STAR_OPERATOR : TokenTypes.STAR;
745          Token token = new Token( tokenType,
746                          getXPath(),
747                          this.currentPosition,
748                          this.currentPosition+1 );
749     
750         consume();
751             
752         return token;
753     }
754     
755     private Token literal()
756     {
757         Token token = null;
758     
759         char match  = LA(1);
760     
761         consume();
762     
763         int start = this.currentPosition;
764             
765         while ( ( token == null )
766                 &&
767                 hasMoreChars() )
768         {
769             if ( LA(1) == match )
770             {
771                 token = new Token( TokenTypes.LITERAL,
772                                    getXPath(),
773                                    start,
774                                    this.currentPosition );
775             }
776             consume();
777         }
778     
779         return token;
780     }
781     
782     private Token dots()
783     {
784         Token token = null;
785     
786         switch ( LA(2) )
787         {
788             case '.':
789             {
790                 token = new Token( TokenTypes.DOT_DOT,
791                                    getXPath(),
792                                    this.currentPosition,
793                                    this.currentPosition+2 ) ;
794                 consume();
795                 consume();
796                 break;
797             }
798             default:
799             {
800                 token = new Token( TokenTypes.DOT,
801                                    getXPath(),
802                                    this.currentPosition,
803                                    this.currentPosition+1 );
804                 consume();
805                 break;
806             }
807         }
808     
809         return token;
810     }
811     
812     private Token leftBracket()
813     {
814         Token token = new Token( TokenTypes.LEFT_BRACKET,
815                                  getXPath(),
816                                  this.currentPosition,
817                                  this.currentPosition+1 );
818     
819         consume();
820     
821         return token;
822     }
823     
824     private Token rightBracket()
825     {
826         Token token = new Token( TokenTypes.RIGHT_BRACKET,
827                                  getXPath(),
828                                  this.currentPosition,
829                                  this.currentPosition+1 );
830     
831         consume();
832     
833         return token;
834     }
835     
836     private Token leftParen()
837     {
838         Token token = new Token( TokenTypes.LEFT_PAREN,
839                                  getXPath(),
840                                  this.currentPosition,
841                                  this.currentPosition+1 );
842     
843         consume();
844     
845         return token;
846     }
847     
848     private Token rightParen()
849     {
850         Token token = new Token( TokenTypes.RIGHT_PAREN,
851                                  getXPath(),
852                                  this.currentPosition,
853                                  this.currentPosition+1 );
854     
855         consume();
856     
857         return token;
858     }
859     
860     private Token slashes()
861     {
862         Token token = null;
863     
864         switch ( LA(2) )
865         {
866             case '/':
867             {
868                 token = new Token( TokenTypes.DOUBLE_SLASH,
869                                    getXPath(),
870                                    this.currentPosition,
871                                    this.currentPosition+2 );
872                 consume();
873                 consume();
874                 break;
875             }
876             default:
877             {
878                 token = new Token( TokenTypes.SLASH,
879                                    getXPath(),
880                                    this.currentPosition,
881                                    this.currentPosition+1 );
882                 consume();
883             }
884         }
885     
886         return token;
887     }
888     
889     private char LA(int i) 
890     {
891         if ( currentPosition + ( i - 1 ) >= this.endPosition )
892         {
893             return (char) -1;
894         }
895     
896         return getXPath().charAt( this.currentPosition + (i - 1) );
897     }
898     
899     private void consume()
900     {
901         ++this.currentPosition;
902     }
903     
904     private boolean hasMoreChars()
905     {
906         return this.currentPosition < this.endPosition;
907     }
908 
909 }