From aee6c4dede5a9c4fa0f7eb134fba61fef738a573 Mon Sep 17 00:00:00 2001 From: Mathnerd314 Date: Tue, 20 Jan 2009 23:49:55 +0000 Subject: [PATCH] Rewrite of tinygettext's POFileReader to look more like a combination of lisp/lexer and lisp/parser, removing need for all those ugly State variables. Also move line numbering to nextChar in lexer.cpp and add a method addChar() to append to the buffer. Should compile, but I (using my English-only system) haven't tested it completely. SVN-Revision: 5829 --- src/lisp/lexer.cpp | 39 +++-- src/lisp/lexer.hpp | 1 + src/tinygettext/tinygettext.cpp | 324 +++++++++++++++++++--------------------- 3 files changed, 170 insertions(+), 194 deletions(-) diff --git a/src/lisp/lexer.cpp b/src/lisp/lexer.cpp index e1a4220f5..451cb9aff 100644 --- a/src/lisp/lexer.cpp +++ b/src/lisp/lexer.cpp @@ -65,6 +65,16 @@ Lexer::nextChar() } } c = *bufpos++; + if(c == '\n') + ++linenumber; +} + +void +Lexer::addChar() +{ + if(token_length < MAX_TOKEN_LENGTH) + token_string[token_length++] = c; + nextChar(); } Lexer::TokenType @@ -73,21 +83,15 @@ Lexer::getNextToken() static const char* delims = "\"();"; while(isspace(c)) { - if(c == '\n') - ++linenumber; nextChar(); - }; + } token_length = 0; switch(c) { case ';': // comment - while(true) { + while(c != '\n') { nextChar(); - if(c == '\n') { - ++linenumber; - break; - } } return getNextToken(); // and again case '(': @@ -98,8 +102,8 @@ Lexer::getNextToken() return TOKEN_CLOSE_PAREN; case '"': { // string int startline = linenumber; + nextChar(); while(1) { - nextChar(); switch(c) { case '"': nextChar(); @@ -107,7 +111,6 @@ Lexer::getNextToken() case '\r': continue; case '\n': - linenumber++; break; case '\\': nextChar(); @@ -129,8 +132,7 @@ Lexer::getNextToken() default: break; } - if(token_length < MAX_TOKEN_LENGTH) - token_string[token_length++] = c; + addChar(); } string_finished: token_string[token_length] = 0; @@ -140,9 +142,7 @@ string_finished: nextChar(); while(isalnum(c) || c == '_') { - if(token_length < MAX_TOKEN_LENGTH) - token_string[token_length++] = c; - nextChar(); + addChar(); } token_string[token_length] = 0; @@ -176,10 +176,7 @@ string_finished: else if(isalnum(c) || c == '_') have_nondigits = true; - if(token_length < MAX_TOKEN_LENGTH) - token_string[token_length++] = c; - - nextChar(); + addChar(); } while(!isspace(c) && !strchr(delims, c)); token_string[token_length] = 0; @@ -194,9 +191,7 @@ string_finished: return TOKEN_INTEGER; } else { do { - if(token_length < MAX_TOKEN_LENGTH) - token_string[token_length++] = c; - nextChar(); + addChar(); } while(!isspace(c) && !strchr(delims, c)); token_string[token_length] = 0; diff --git a/src/lisp/lexer.hpp b/src/lisp/lexer.hpp index 5c5f2d58c..f85c86e15 100644 --- a/src/lisp/lexer.hpp +++ b/src/lisp/lexer.hpp @@ -53,6 +53,7 @@ private: }; inline void nextChar(); + inline void addChar(); std::istream& stream; bool eof; diff --git a/src/tinygettext/tinygettext.cpp b/src/tinygettext/tinygettext.cpp index e324745f9..21fea977f 100644 --- a/src/tinygettext/tinygettext.cpp +++ b/src/tinygettext/tinygettext.cpp @@ -540,39 +540,34 @@ Dictionary::add_translation(const std::string& msgid, const std::string& msgstr) class POFileReader { private: - struct Token - { - std::string keyword; - std::string content; - }; - Dictionary& dict; + std::istream& in; std::string from_charset; std::string to_charset; - std::string current_msgid; - std::string current_msgid_plural; - std::map msgstr_plural; - int line_num; - - enum { WANT_MSGID, WANT_MSGSTR, WANT_MSGSTR_PLURAL, WANT_MSGID_PLURAL } state; + int c; //TODO: char c? unsigned char c? + enum Token { + TOKEN_KEYWORD, //msgstr, msgid, etc. + TOKEN_CONTENT, //string literals, concatenated ("" "foo\n" "bar\n" -> "foo\nbar\n") + TOKEN_EOF //ran out of tokens + }; + Token token; + std::string tokenContent; //current contents of the keyword or string literal(s) public: - POFileReader(std::istream& in, Dictionary& dict_) - : dict(dict_) + POFileReader(std::istream& in_, Dictionary& dict_) + : in(in_), dict(dict_) { - state = WANT_MSGID; line_num = 0; - char c = in.get(); - if(c == (char) 0xef) { // skip UTF-8 intro that some texteditors produce - in.get(); - in.get(); - } else { - in.unget(); + nextChar(); + if(c == 0xef) { // skip UTF-8 intro that some text editors produce + nextChar(); + nextChar(); + nextChar(); } - tokenize_po(in); + tokenize_po(); } void parse_header(const std::string& header) @@ -612,181 +607,166 @@ public: } } - void add_token(const Token& token) + inline void nextChar() { - switch(state) - { - case WANT_MSGID: - if (token.keyword == "msgid") - { - current_msgid = token.content; - state = WANT_MSGID_PLURAL; - } - else if (token.keyword.empty()) - { - //log_warning << "Got EOF, everything looks ok." << std::endl; - } - else - { - log_warning << "tinygettext: expected 'msgid' keyword, got " << token.keyword << " at line " << line_num << std::endl; - } - break; - - case WANT_MSGID_PLURAL: - if (token.keyword == "msgid_plural") - { - current_msgid_plural = token.content; - state = WANT_MSGSTR_PLURAL; - } - else - { - state = WANT_MSGSTR; - add_token(token); - } - break; + c = in.get(); + if (c == '\n') + line_num++; + } - case WANT_MSGSTR: - if (token.keyword == "msgstr") - { - if (current_msgid == "") - { // .po Header is hidden in the msgid with the empty string - parse_header(token.content); - } - else - { - dict.add_translation(current_msgid, convert(token.content, from_charset, to_charset)); - } - state = WANT_MSGID; - } - else - { - log_warning << "tinygettext: expected 'msgstr' keyword, got " << token.keyword << " at line " << line_num << std::endl; - } - break; + inline void skipSpace() + { + if(c == EOF) + return; - case WANT_MSGSTR_PLURAL: - if (has_prefix(token.keyword, "msgstr[")) - { - int num; - if (sscanf(token.keyword.c_str(), "msgstr[%d]", &num) != 1) - { - log_warning << "Error: Couldn't parse: " << token.keyword << std::endl; - } - else - { - msgstr_plural[num] = convert(token.content, from_charset, to_charset); - } - } - else - { - dict.add_translation(current_msgid, current_msgid_plural, msgstr_plural); + while(isspace(static_cast(c))) nextChar(); - state = WANT_MSGID; - add_token(token); - } - break; + // Comments are whitespace too (remove if we ever parse comments) + if (c == '#') + { + do { + nextChar(); + } while(c != '\n' && c != EOF); } } - inline int getchar(std::istream& in) - { - int c = in.get(); - if (c == '\n') - line_num += 1; - return c; + inline bool expectToken(std::string type, Token wanted) { + if(token != wanted) { + log_warning << "Expected " << type << ", got "; + if(token == TOKEN_EOF) + log_warning << "EOF"; + else if(token == TOKEN_KEYWORD) + log_warning << "keyword '" << tokenContent << "'"; + else + log_warning << "string \"" << tokenContent << '"'; + + log_warning << " at line " << line_num << std::endl; + return false; + } + return true; } - void tokenize_po(std::istream& in) - { - enum State { READ_KEYWORD, - READ_CONTENT, - READ_CONTENT_IN_STRING, - SKIP_COMMENT }; + inline bool expectContent(std::string type, std::string wanted) { + if(tokenContent != wanted) { + log_warning << "Expected " << type << ", got "; + if(token == TOKEN_EOF) + log_warning << "EOF"; + else if(token == TOKEN_KEYWORD) + log_warning << "keyword '" << tokenContent << "'"; + else + log_warning << "string \"" << tokenContent << '"'; - State state = READ_KEYWORD; - int c; - Token token; + log_warning << " at line " << line_num << std::endl; + return false; + } + return true; + } - while((c = getchar(in)) != EOF) - { - //log_debug << "Lexing char: " << char(c) << " " << state << std::endl; - switch(state) - { - case READ_KEYWORD: - if (c == '#') - { - state = SKIP_COMMENT; - } - else if (c == '\n') - { - } - else - { - // Read a new token - token = Token(); + void tokenize_po() + { + while((token = nextToken()) != TOKEN_EOF) + { + if(!expectToken("'msgid' keyword", TOKEN_KEYWORD) || !expectContent("'msgid' keyword", "msgid")) break; - do { // Read keyword - token.keyword += c; - } while((c = getchar(in)) != EOF && !isspace(static_cast(c))); - in.unget(); + token = nextToken(); + if(!expectToken("name after msgid", TOKEN_CONTENT)) break; + std::string current_msgid = tokenContent; - state = READ_CONTENT; - } - break; - - case READ_CONTENT: - while((c = getchar(in)) != EOF) - { - if (c == '"') { - // Found start of content - state = READ_CONTENT_IN_STRING; - break; - } else if (isspace(static_cast(c))) { - // skip - } else { // Read something that may be a keyword - in.unget(); - state = READ_KEYWORD; - add_token(token); - token = Token(); - break; - } - } - break; + token = nextToken(); + if(!expectToken("msgstr or msgid_plural", TOKEN_KEYWORD)) break; + if(tokenContent == "msgid_plural") + { + //Plural form + token = nextToken(); + if(!expectToken("msgid_plural content", TOKEN_CONTENT)) break; + std::string current_msgid_plural = tokenContent; - case READ_CONTENT_IN_STRING: - if (c == '\\') { - c = getchar(in); - if (c != EOF) + std::map msgstr_plural; + while((token = nextToken()) == TOKEN_KEYWORD && has_prefix(tokenContent, "msgstr[")) { - if (c == 'n') token.content += '\n'; - else if (c == 't') token.content += '\t'; - else if (c == 'r') token.content += '\r'; - else if (c == '"') token.content += '"'; - else if (c == '\\') token.content += '\\'; - else + int num; + if (sscanf(tokenContent.c_str(), "msgstr[%d]", &num) != 1) { - log_warning << "Unhandled escape character: " << char(c) << std::endl; + log_warning << "Error: Couldn't parse: " << tokenContent << std::endl; } + + token = nextToken(); + if(!expectToken("msgstr[x] content", TOKEN_CONTENT)) break; + msgstr_plural[num] = convert(tokenContent, from_charset, to_charset); + } + dict.add_translation(current_msgid, current_msgid_plural, msgstr_plural); + } + else + { + // "Ordinary" translation + if(!expectContent("'msgstr' keyword", "msgstr")) break; + + token = nextToken(); + if(!expectToken("translation in msgstr", TOKEN_CONTENT)) break; + + if (current_msgid == "") + { // .po Header is hidden in the msgid with the empty string + parse_header(tokenContent); } else { - log_warning << "Unterminated string" << std::endl; + dict.add_translation(current_msgid, convert(tokenContent, from_charset, to_charset)); } - } else if (c == '"') { // Content string is terminated - state = READ_CONTENT; - } else { - token.content += c; } - break; + } + } + + Token nextToken() + { + if(c == EOF) + return TOKEN_EOF; + + //Clear token contents + tokenContent = ""; + + skipSpace(); - case SKIP_COMMENT: - if (c == '\n') - state = READ_KEYWORD; - break; + if(c != '"') + { + // Read a keyword + do { + tokenContent += c; + nextChar(); + } while(c != EOF && !isspace(static_cast(c))); + return TOKEN_KEYWORD; + } + else + { + do { + nextChar(); + // Read content + while(c != EOF && c != '"') { + if (c == '\\') { + nextChar(); + if (c == 'n') c = '\n'; + else if (c == 't') c = '\t'; + else if (c == 'r') c = '\r'; + else if (c == '"') c = '"'; + else if (c == '\\') c = '\\'; + else + { + log_warning << "Unhandled escape character: " << char(c) << std::endl; + c = ' '; + } + } + tokenContent += c; + nextChar(); } + if(c == EOF) { + log_warning << "Unclosed string literal: " << tokenContent << std::endl; + return TOKEN_CONTENT; + } + + // Read more strings? + skipSpace(); + } while(c == '"'); + return TOKEN_CONTENT; } - add_token(token); - token = Token(); } }; -- 2.11.0