external/tinygettext/src/po_parser.cpp

   1 //  tinygettext - A gettext replacement that works directly on .po files
   2 //  Copyright (C) 2009 Ingo Ruhnke <grumbel@gmx.de>
   3 //
   4 //  This program is free software; you can redistribute it and/or
   5 //  modify it under the terms of the GNU General Public License
   6 //  as published by the Free Software Foundation; either version 2
   7 //  of the License, or (at your option) any later version.
   8 //
   9 //  This program is distributed in the hope that it will be useful,
  10 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 //  GNU General Public License for more details.
  13 //
  14 //  You should have received a copy of the GNU General Public License
  15 //  along with this program; if not, write to the Free Software
  16 //  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  17
  18 #include "tinygettext/po_parser.hpp"
  19
  20 #include <iostream>
  21 #include <ctype.h>
  22 #include <string>
  23 #include <istream>
  24 #include <string.h>
  25 #include <unordered_map>
  26 #include <stdlib.h>
  27
  28 #include "tinygettext/language.hpp"
  29 #include "tinygettext/log_stream.hpp"
  30 #include "tinygettext/iconv.hpp"
  31 #include "tinygettext/dictionary.hpp"
  32 #include "tinygettext/plural_forms.hpp"
  33
  34 namespace tinygettext {
  35
// Class-wide switch, enabled by default. When set, get_string() emits extra
// warnings about formatting it would otherwise accept silently (spacing
// between keyword and string, leading whitespace on continuation lines).
bool POParser::pedantic = true;
  37 \f
  38 void
  39 POParser::parse(const std::string& filename, std::istream& in, Dictionary& dict)
  40 {
  41   POParser parser(filename, in, dict);
  42   parser.parse();
  43 }
  44 \f
// Empty tag type: thrown by POParser::error() and caught in POParser::parse()
// so that parsing can continue with the next entry.
class POParserError {};
  46
  47 POParser::POParser(const std::string& filename_, std::istream& in_, Dictionary& dict_, bool use_fuzzy_) :
  48   filename(filename_),
  49   in(in_),
  50   dict(dict_),
  51   use_fuzzy(use_fuzzy_),
  52   running(false),
  53   eof(false),
  54   big5(false),
  55   line_number(0),
  56   current_line(),
  57   conv()
  58 {
  59 }
  60
  61 POParser::~POParser()
  62 {
  63 }
  64
  65 void
  66 POParser::warning(const std::string& msg)
  67 {
  68   log_warning << filename << ":" << line_number << ": warning: " << msg << ": " << current_line << std::endl;
  69   //log_warning << "Line: " << current_line << std::endl;
  70 }
  71
  72 void
  73 POParser::error(const std::string& msg)
  74 {
  75   log_error << filename << ":" << line_number << ": error: " << msg  << ": " << current_line << std::endl;
  76
  77   // Try to recover from an error by searching for start of another entry
  78   do
  79     next_line();
  80   while(!eof && !is_empty_line());
  81
  82   throw POParserError();
  83 }
  84
  85 void
  86 POParser::next_line()
  87 {
  88   line_number += 1;
  89   if (!std::getline(in, current_line))
  90     eof = true;
  91 }
  92
  93 void
  94 POParser::get_string_line(std::ostringstream& out, size_t skip)
  95 {
  96   if (skip+1 >= static_cast<unsigned int>(current_line.size()))
  97     error("unexpected end of line");
  98
  99   if (current_line[skip] != '"')
 100     error("expected start of string '\"'");
 101
 102   std::string::size_type i;
 103   for(i = skip+1; current_line[i] != '\"'; ++i)
 104   {
 105     if (big5 && static_cast<unsigned char>(current_line[i]) >= 0x81 && static_cast<unsigned char>(current_line[i]) <= 0xfe)
 106     {
 107       out << current_line[i];
 108
 109       i += 1;
 110
 111       if (i >= current_line.size())
 112         error("invalid big5 encoding");
 113
 114       out << current_line[i];
 115     }
 116     else if (i >= current_line.size())
 117     {
 118       error("unexpected end of string");
 119     }
 120     else if (current_line[i] == '\\')
 121     {
 122       i += 1;
 123
 124       if (i >= current_line.size())
 125         error("unexpected end of string in handling '\\'");
 126
 127       switch (current_line[i])
 128       {
 129         case 'a':  out << '\a'; break;
 130         case 'b':  out << '\b'; break;
 131         case 'v':  out << '\v'; break;
 132         case 'n':  out << '\n'; break;
 133         case 't':  out << '\t'; break;
 134         case 'r':  out << '\r'; break;
 135         case '"':  out << '"'; break;
 136         case '\\': out << '\\'; break;
 137         default:
 138           std::ostringstream err;
 139           err << "unhandled escape '\\" << current_line[i] << "'";
 140           warning(err.str());
 141
 142           out << current_line[i-1] << current_line[i];
 143           break;
 144       }
 145     }
 146     else
 147     {
 148       out << current_line[i];
 149     }
 150   }
 151
 152   // process trailing garbage in line and warn if there is any
 153   for(i = i+1; i < current_line.size(); ++i)
 154     if (!isspace(current_line[i]))
 155     {
 156       warning("unexpected garbage after string ignoren");
 157       break;
 158     }
 159 }
 160
 161 std::string
 162 POParser::get_string(unsigned int skip)
 163 {
 164   std::ostringstream out;
 165
 166   if (skip+1 >= static_cast<unsigned int>(current_line.size()))
 167     error("unexpected end of line");
 168
 169   if (current_line[skip] == ' ' && current_line[skip+1] == '"')
 170   {
 171     get_string_line(out, skip+1);
 172   }
 173   else
 174   {
 175     if (pedantic)
 176       warning("keyword and string must be seperated by a single space");
 177
 178     for(;;)
 179     {
 180       if (skip >= static_cast<unsigned int>(current_line.size()))
 181         error("unexpected end of line");
 182       else if (current_line[skip] == '\"')
 183       {
 184         get_string_line(out, skip);
 185         break;
 186       }
 187       else if (!isspace(current_line[skip]))
 188       {
 189         error("string must start with '\"'");
 190       }
 191       else
 192       {
 193         // skip space
 194       }
 195
 196       skip += 1;
 197     }
 198   }
 199
 200 next:
 201   next_line();
 202   for(std::string::size_type i = 0; i < current_line.size(); ++i)
 203   {
 204     if (current_line[i] == '"')
 205     {
 206       if (i == 1)
 207         if (pedantic)
 208           warning("leading whitespace before string");
 209
 210       get_string_line(out,  i);
 211       goto next;
 212     }
 213     else if (isspace(current_line[i]))
 214     {
 215       // skip
 216     }
 217     else
 218     {
 219       break;
 220     }
 221   }
 222
 223   return out.str();
 224 }
 225
 226 static bool has_prefix(const std::string& lhs, const std::string rhs)
 227 {
 228   if (lhs.length() < rhs.length())
 229     return false;
 230   else
 231     return lhs.compare(0, rhs.length(), rhs) == 0;
 232 }
 233
 234 void
 235 POParser::parse_header(const std::string& header)
 236 {
 237   std::string from_charset;
 238   std::string::size_type start = 0;
 239   for(std::string::size_type i = 0; i < header.length(); ++i)
 240   {
 241     if (header[i] == '\n')
 242     {
 243       std::string line = header.substr(start, i - start);
 244
 245       if (has_prefix(line, "Content-Type:"))
 246       {
 247         // from_charset = line.substr(len);
 248         unsigned int len = strlen("Content-Type: text/plain; charset=");
 249         if (line.compare(0, len, "Content-Type: text/plain; charset=") == 0)
 250         {
 251           from_charset = line.substr(len);
 252
 253           for(std::string::iterator ch = from_charset.begin(); ch != from_charset.end(); ++ch)
 254             *ch = static_cast<char>(toupper(*ch));
 255         }
 256         else
 257         {
 258           warning("malformed Content-Type header");
 259         }
 260       }
 261       else if (has_prefix(line, "Plural-Forms:"))
 262       {
 263         PluralForms plural_forms = PluralForms::from_string(line);
 264         if (!plural_forms)
 265         {
 266           warning("unknown Plural-Forms given");
 267         }
 268         else
 269         {
 270           if (!dict.get_plural_forms())
 271           {
 272             dict.set_plural_forms(plural_forms);
 273           }
 274           else
 275           {
 276             if (dict.get_plural_forms() != plural_forms)
 277             {
 278               warning("Plural-Forms missmatch between .po file and dictionary");
 279             }
 280           }
 281         }
 282       }
 283       start = i+1;
 284     }
 285   }
 286
 287   if (from_charset.empty() || from_charset == "CHARSET")
 288   {
 289     warning("charset not specified for .po, fallback to utf-8");
 290     from_charset = "UTF-8";
 291   }
 292   else if (from_charset == "BIG5")
 293   {
 294     big5 = true;
 295   }
 296
 297   conv.set_charsets(from_charset, dict.get_charset());
 298 }
 299
 300 bool
 301 POParser::is_empty_line()
 302 {
 303   if (current_line.empty())
 304   {
 305     return true;
 306   }
 307   else if (current_line[0] == '#')
 308   { // handle comments as empty lines
 309     if (current_line.size() == 1 || (current_line.size() >= 2 && isspace(current_line[1])))
 310       return true;
 311     else
 312       return false;
 313   }
 314   else
 315   {
 316     for(std::string::iterator i = current_line.begin(); i != current_line.end(); ++i)
 317     {
 318       if (!isspace(*i))
 319         return false;
 320     }
 321   }
 322   return true;
 323 }
 324
 325 bool
 326 POParser::prefix(const char* prefix_str)
 327 {
 328   return current_line.compare(0, strlen(prefix_str), prefix_str) == 0;
 329 }
 330
 331 void
 332 POParser::parse()
 333 {
 334   next_line();
 335
 336   // skip UTF-8 intro that some text editors produce
 337   // see http://en.wikipedia.org/wiki/Byte-order_mark
 338   if (current_line.size() >= 3 &&
 339       current_line[0] == static_cast<char>(0xef) &&
 340       current_line[1] == static_cast<char>(0xbb) &&
 341       current_line[2] == static_cast<char>(0xbf))
 342   {
 343     current_line = current_line.substr(3);
 344   }
 345
 346   // Parser structure
 347   while(!eof)
 348   {
 349     try
 350     {
 351       bool fuzzy =  false;
 352       bool has_msgctxt = false;
 353       std::string msgctxt;
 354       std::string msgid;
 355
 356       while(prefix("#"))
 357       {
 358         if (current_line.size() >= 2 && current_line[1] == ',')
 359         {
 360           // FIXME: Rather simplistic hunt for fuzzy flag
 361           if (current_line.find("fuzzy", 2) != std::string::npos)
 362             fuzzy = true;
 363         }
 364
 365         next_line();
 366       }
 367
 368       if (!is_empty_line())
 369       {
 370         if (prefix("msgctxt"))
 371         {
 372           has_msgctxt = true;
 373           msgctxt = get_string(7);
 374         }
 375
 376         if (prefix("msgid"))
 377           msgid = get_string(5);
 378         else
 379           error("expected 'msgid'");
 380
 381         if (prefix("msgid_plural"))
 382         {
 383           std::string msgid_plural = get_string(12);
 384           std::vector<std::string> msgstr_num;
 385           bool saw_nonempty_msgstr = false;
 386
 387         next:
 388           if (is_empty_line())
 389           {
 390             if (msgstr_num.empty())
 391               error("expected 'msgstr[N] (0 <= N <= 9)'");
 392           }
 393           else if (prefix("msgstr[") &&
 394                    current_line.size() > 8 &&
 395                    isdigit(current_line[7]) && current_line[8] == ']')
 396           {
 397             unsigned int number = static_cast<unsigned int>(current_line[7] - '0');
 398             std::string msgstr = get_string(9);
 399
 400             if(!msgstr.empty())
 401               saw_nonempty_msgstr = true;
 402
 403             if (number >= msgstr_num.size())
 404               msgstr_num.resize(number+1);
 405
 406             msgstr_num[number] = conv.convert(msgstr);
 407             goto next;
 408           }
 409           else
 410           {
 411             error("expected 'msgstr[N]'");
 412           }
 413
 414           if (!is_empty_line())
 415             error("expected 'msgstr[N]' or empty line");
 416
 417           if (saw_nonempty_msgstr)
 418           {
 419             if (use_fuzzy || !fuzzy)
 420             {
 421               if (!dict.get_plural_forms())
 422               {
 423                 warning("msgstr[N] seen, but no Plural-Forms given");
 424               }
 425               else
 426               {
 427                 if (msgstr_num.size() != dict.get_plural_forms().get_nplural())
 428                 {
 429                   warning("msgstr[N] count doesn't match Plural-Forms.nplural");
 430                 }
 431               }
 432
 433               if (has_msgctxt)
 434                 dict.add_translation(msgctxt, msgid, msgid_plural, msgstr_num);
 435               else
 436                 dict.add_translation(msgid, msgid_plural, msgstr_num);
 437             }
 438
 439             if (0)
 440             {
 441               std::cout << (fuzzy?"fuzzy":"not-fuzzy") << std::endl;
 442               std::cout << "msgid \"" << msgid << "\"" << std::endl;
 443               std::cout << "msgid_plural \"" << msgid_plural << "\"" << std::endl;
 444               for(std::vector<std::string>::size_type i = 0; i < msgstr_num.size(); ++i)
 445                 std::cout << "msgstr[" << i << "] \"" << conv.convert(msgstr_num[i]) << "\"" << std::endl;
 446               std::cout << std::endl;
 447             }
 448           }
 449         }
 450         else if (prefix("msgstr"))
 451         {
 452           std::string msgstr = get_string(6);
 453
 454           if (msgid.empty())
 455           {
 456             parse_header(msgstr);
 457           }
 458           else if(!msgstr.empty())
 459           {
 460             if (use_fuzzy || !fuzzy)
 461             {
 462               if (has_msgctxt)
 463                 dict.add_translation(msgctxt, msgid, conv.convert(msgstr));
 464               else
 465                 dict.add_translation(msgid, conv.convert(msgstr));
 466             }
 467
 468             if (0)
 469             {
 470               std::cout << (fuzzy?"fuzzy":"not-fuzzy") << std::endl;
 471               std::cout << "msgid \"" << msgid << "\"" << std::endl;
 472               std::cout << "msgstr \"" << conv.convert(msgstr) << "\"" << std::endl;
 473               std::cout << std::endl;
 474             }
 475           }
 476         }
 477         else
 478         {
 479           error("expected 'msgstr' or 'msgid_plural'");
 480         }
 481       }
 482
 483       if (!is_empty_line())
 484         error("expected empty line");
 485
 486       next_line();
 487     }
 488     catch(POParserError&)
 489     {
 490     }
 491   }
 492 }
 493
 494 } // namespace tinygettext
 495
 496 /* EOF */