src/lisp/lexer.cpp

   1 //  $Id$
   2 //
   3 //  Copyright (C) 2004 Matthias Braun <matze@braunis.de>
   4 //  code in this file based on lispreader from Mark Probst
   5 //
   6 //  This program is free software; you can redistribute it and/or
   7 //  modify it under the terms of the GNU General Public License
   8 //  as published by the Free Software Foundation; either version 2
   9 //  of the License, or (at your option) any later version.
  10 //
  11 //  This program is distributed in the hope that it will be useful,
  12 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 //  GNU General Public License for more details.
  15 //
  16 //  You should have received a copy of the GNU General Public License
  17 //  along with this program; if not, write to the Free Software
  18 //  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#include <config.h>

#include <cctype>
#include <cstring>
#include <iostream>
#include <sstream>
#include <stdexcept>

#include "lexer.h"
  26
  27 namespace lisp
  28 {
  29
  30 class EOFException
  31 {
  32 };
  33
  34 Lexer::Lexer(std::istream& newstream)
  35     : stream(newstream), eof(false), linenumber(0)
  36 {
  37   try {
  38     // trigger a refill of the buffer
  39     c = 0;
  40     bufend = 0;
  41     nextChar();
  42   } catch(EOFException& e) {
  43   }
  44 }
  45
  46 Lexer::~Lexer()
  47 {
  48 }
  49
  50 void
  51 Lexer::nextChar()
  52 {
  53   ++c;
  54   if(c >= bufend) {
  55     if(eof)
  56       throw EOFException();
  57     stream.read(buffer, BUFFER_SIZE);
  58     size_t bytes_read = stream.gcount();
  59
  60     c = buffer;
  61     bufend = buffer + bytes_read;
  62
  63     // the following is a hack that appends an additional ' ' at the end of
  64     // the file to avoid problems when parsing symbols/elements and a sudden
  65     // EOF. This is faster than relying on unget and IMO also nicer.
  66     if(bytes_read == 0 || stream.eof()) {
  67       eof = true;
  68       *bufend = ' ';
  69       ++bufend;
  70     }
  71   }
  72 }
  73
  74 Lexer::TokenType
  75 Lexer::getNextToken()
  76 {
  77   static const char* delims = "\"();";
  78
  79   try {
  80     while(isspace(*c)) {
  81       if(*c == '\n')
  82         ++linenumber;
  83       nextChar();
  84     };
  85
  86     token_length = 0;
  87
  88     switch(*c) {
  89       case ';': // comment
  90         while(true) {
  91           nextChar();
  92           if(*c == '\n') {
  93             ++linenumber;
  94             break;
  95           }
  96         }
  97         return getNextToken(); // and again
  98       case '(':
  99         nextChar();
 100         return TOKEN_OPEN_PAREN;
 101       case ')':
 102         nextChar();
 103         return TOKEN_CLOSE_PAREN;
 104       case '"': {  // string
 105         int startline = linenumber;
 106         try {
 107           while(1) {
 108             nextChar();
 109             if(*c == '"')
 110               break;
 111             else if(*c == '\n')
 112               linenumber++;
 113             else if(*c == '\\') {
 114               nextChar();
 115               switch(*c) {
 116                 case 'n':
 117                   *c = '\n';
 118                   break;
 119                 case 't':
 120                   *c = '\t';
 121                   break;
 122               }
 123             }
 124             if(token_length < MAX_TOKEN_LENGTH)
 125               token_string[token_length++] = *c;
 126           }
 127           token_string[token_length] = 0;
 128         } catch(EOFException& ) {
 129           std::stringstream msg;
 130           msg << "Parse error in line " << startline << ": "
 131             << "EOF while parsing string.";
 132           throw std::runtime_error(msg.str());
 133         }
 134         nextChar();
 135         return TOKEN_STRING;
 136       }
 137       case '#': // constant
 138         try {
 139           nextChar();
 140
 141           while(isalnum(*c) || *c == '_') {
 142             if(token_length < MAX_TOKEN_LENGTH)
 143               token_string[token_length++] = *c;
 144             nextChar();
 145           }
 146           token_string[token_length] = 0;
 147         } catch(EOFException& ) {
 148           std::stringstream msg;
 149           msg << "Parse Error in line " << linenumber << ": "
 150             << "EOF while parsing constant.";
 151           throw std::runtime_error(msg.str());
 152         }
 153
 154         if(strcmp(token_string, "t") == 0)
 155           return TOKEN_TRUE;
 156         if(strcmp(token_string, "f") == 0)
 157           return TOKEN_FALSE;
 158
 159         // we only handle #t and #f constants at the moment...
 160
 161         {
 162           std::stringstream msg;
 163           msg << "Parse Error in line " << linenumber << ": "
 164             << "Unknown constant '" << token_string << "'.";
 165           throw std::runtime_error(msg.str());
 166         }
 167
 168       default:
 169         if(isdigit(*c) || *c == '-') {
 170           bool have_nondigits = false;
 171           bool have_digits = false;
 172           int have_floating_point = 0;
 173
 174           do {
 175             if(isdigit(*c))
 176               have_digits = true;
 177             else if(*c == '.')
 178               ++have_floating_point;
 179             else if(isalnum(*c) || *c == '_')
 180               have_nondigits = true;
 181
 182             if(token_length < MAX_TOKEN_LENGTH)
 183               token_string[token_length++] = *c;
 184
 185             nextChar();
 186           } while(!isspace(*c) && !strchr(delims, *c));
 187
 188           token_string[token_length] = 0;
 189
 190           // no nextChar
 191
 192           if(have_nondigits || !have_digits || have_floating_point > 1)
 193             return TOKEN_SYMBOL;
 194           else if(have_floating_point == 1)
 195             return TOKEN_REAL;
 196           else
 197             return TOKEN_INTEGER;
 198         } else {
 199           do {
 200             if(token_length < MAX_TOKEN_LENGTH)
 201               token_string[token_length++] = *c;
 202             nextChar();
 203           } while(!isspace(*c) && !strchr(delims, *c));
 204           token_string[token_length] = 0;
 205
 206           // no nextChar
 207
 208           return TOKEN_SYMBOL;
 209         }
 210     }
 211   } catch(EOFException& ) {
 212     return TOKEN_EOF;
 213   }
 214 }
 215
 216 } // end of namespace lisp
 217