apertium · mr-martian · Jun 30, 2021 · May 22, 2021 · May 26, 2021 · May 26, 2021
diff --git a/.gitignore b/.gitignore
@@ -80,3 +80,5 @@
 *.egg-info/
 *.egg
 **/.mypy_cache/
+
+*~
diff --git a/configure.ac b/configure.ac
@@ -38,7 +38,8 @@ AC_ARG_ENABLE(profile,
               [CXXFLAGS="-pg -g -Wall"; CFLAGS="-pg -g -Wall"; LDFLAGS="-pg"])
 
 
-PKG_CHECK_MODULES(LTTOOLBOX, [libxml-2.0 >= 2.6.17])
+PKG_CHECK_MODULES(LIBXML, [libxml-2.0 >= 2.6.17])
+PKG_CHECK_MODULES(ICU, [icu-i18n, icu-io, icu-uc])
 
 # Check for wide strings
 AC_DEFUN([AC_CXX_WSTRING],[

diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am
@@ -3,12 +3,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h  \
             deserialiser.h entry_token.h expander.h fst_processor.h lt_locale.h \
             ltstr.h match_exe.h match_node.h match_state.h my_stdio.h node.h \
             pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \
+			string_utils.h \
             transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \
             string_to_wostream.h
 cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
              expander.cc fst_processor.cc lt_locale.cc match_exe.cc \
              match_node.cc match_state.cc node.cc pattern_list.cc \
-             regexp_compiler.cc sorted_vector.cc state.cc transducer.cc \
+             regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \
              trans_exe.cc xml_parse_util.cc tmx_compiler.cc
 
 library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME)
@@ -27,33 +28,16 @@ lttoolboxlib = $(prefix)/lib
 
 lttoolbox_DATA = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd
 
-lt_print_SOURCES = lt_print.cc
-lt_print_LDADD = liblttoolbox$(VERSION_MAJOR).la
-lt_print_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
+LDADD = liblttoolbox$(VERSION_MAJOR).la
+AM_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LIBXML_LIBS) $(ICU_LIBS)
 
+lt_print_SOURCES = lt_print.cc
 lt_trim_SOURCES = lt_trim.cc
-lt_trim_LDADD = liblttoolbox$(VERSION_MAJOR).la
-lt_trim_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
-
 lt_comp_SOURCES = lt_comp.cc
-lt_comp_LDADD = liblttoolbox$(VERSION_MAJOR).la
-lt_comp_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
-
 lt_proc_SOURCES = lt_proc.cc
-lt_proc_LDADD = liblttoolbox$(VERSION_MAJOR).la
-lt_proc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
-
 lt_expand_SOURCES = lt_expand.cc
-lt_expand_LDADD = liblttoolbox$(VERSION_MAJOR).la
-lt_expand_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
-
 lt_tmxcomp_SOURCES = lt_tmxcomp.cc
-lt_tmxcomp_LDADD = liblttoolbox$(VERSION_MAJOR).la
-lt_tmxcomp_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
-
 lt_tmxproc_SOURCES = lt_tmxproc.cc
-lt_tmxproc_LDADD = liblttoolbox$(VERSION_MAJOR).la
-lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
 
 #lt-validate-dictionary: Makefile.am validate-header.sh
 #	@echo "Creating lt-validate-dictionary script"
@@ -67,7 +51,7 @@ lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
 
 man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1
 
-INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS)
+INCLUDES = -I$(top_srcdir) $(LIBXML_CFLAGS) $(ICU_CFLAGS)
 if WINDOWS
   INCLUDES += -I$(top_srcdir)/utf8
 endif

diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc
@@ -26,11 +26,10 @@
 #include <cwchar>
 #include <cwctype>
 
-#if defined(_WIN32) && !defined(_MSC_VER)
-#include <utf8_fwrap.h>
-#endif
+#include "string_utils.h"
 
 using namespace std;
+using namespace icu;
 
 Alphabet::Alphabet()
 {
@@ -74,7 +73,7 @@ Alphabet::copy(Alphabet const &a)
 }
 
 void
-Alphabet::includeSymbol(wstring const &s)
+Alphabet::includeSymbol(UnicodeString const &s)
 {
   if(slexic.find(s) == slexic.end())
   {
@@ -99,13 +98,13 @@ Alphabet::operator()(int const c1, int const c2)
 }
 
 int
-Alphabet::operator()(wstring const &s)
+Alphabet::operator()(UnicodeString const &s)
 {
   return slexic[s];
 }
 
 int
-Alphabet::operator()(wstring const &s) const
+Alphabet::operator()(UnicodeString const &s) const
 {
   auto it = slexic.find(s);
   if (it == slexic.end()) {
@@ -115,7 +114,7 @@ Alphabet::operator()(wstring const &s) const
 }
 
 bool
-Alphabet::isSymbolDefined(wstring const &s)
+Alphabet::isSymbolDefined(UnicodeString const &s)
 {
   return slexic.find(s) != slexic.end();
 }
@@ -133,7 +132,7 @@ Alphabet::write(FILE *output)
   Compression::multibyte_write(slexicinv.size(), output);  // taglist size
   for(unsigned int i = 0, limit = slexicinv.size(); i < limit; i++)
   {
-    Compression::wstring_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output);
+    Compression::string_write(slexicinv[i].tempSubString(1, slexicinv[i].length()-2), output);
   }
 
   // Then we write the list of pairs
@@ -160,7 +159,7 @@ Alphabet::read(FILE *input)
   while(tam > 0)
   {
     tam--;
-    wstring mytag = L"<" + Compression::wstring_read(input) + L">";
+    UnicodeString mytag = "<" + Compression::string_read(input) + ">";
     a_new.slexicinv.push_back(mytag);
     a_new.slexic[mytag]= -a_new.slexicinv.size(); // ToDo: This does not turn the result negative due to unsigned semantics
   }
@@ -185,7 +184,7 @@ Alphabet::read(FILE *input)
 void
 Alphabet::serialise(std::ostream &serialised) const
 {
-  Serialiser<const vector<wstring> >::serialise(slexicinv, serialised);
+  Serialiser<const vector<UnicodeString> >::serialise(slexicinv, serialised);
   Serialiser<vector<pair<int, int> > >::serialise(spairinv, serialised);
 }
 
@@ -196,7 +195,7 @@ Alphabet::deserialise(std::istream &serialised)
   slexic.clear();
   spairinv.clear();
   spair.clear();
-  slexicinv = Deserialiser<vector<wstring> >::deserialise(serialised);
+  slexicinv = Deserialiser<vector<UnicodeString> >::deserialise(serialised);
   for (size_t i = 0; i < slexicinv.size(); i++) {
     slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics
   }
@@ -207,20 +206,20 @@ Alphabet::deserialise(std::istream &serialised)
 }
 
 void
-Alphabet::writeSymbol(int const symbol, FILE *output) const
+Alphabet::writeSymbol(int const symbol, UFILE *output) const
 {
   if(symbol < 0)
   {
-    fputws_unlocked(slexicinv[-symbol-1].c_str(), output);
+    u_file_write(slexicinv[-symbol-1].getBuffer(), slexicinv[-symbol-1].length(), output); // exact-length write of the tag; u_fputs needs a NUL-terminated UChar* and appends a newline
   }
   else
   {
-    fputwc_unlocked(static_cast<wchar_t>(symbol), output);
+    u_fputc(static_cast<UChar32>(symbol), output); // u_fputc takes a UChar32; casting to UChar would truncate code points above U+FFFF
   }
 }
 
 void
-Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const
+Alphabet::getSymbol(UnicodeString &result, int const symbol, bool uppercase) const
 {
   if(symbol == 0)
   {
@@ -231,7 +230,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const
   {
     if(symbol >= 0)
     {
-      result += static_cast<wchar_t>(symbol);
+      result += static_cast<UChar>(symbol);
     }
     else
     {
@@ -240,7 +239,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const
   }
   else if(symbol >= 0)
   {
-    result += static_cast<wchar_t>(towupper(static_cast<wint_t>(symbol)));
+    result += static_cast<UChar>(toupper(static_cast<wint_t>(symbol)));
   }
   else
   {
@@ -261,7 +260,7 @@ Alphabet::decode(int const code) const
 }
 
 set<int>
-Alphabet::symbolsWhereLeftIs(wchar_t l) const {
+Alphabet::symbolsWhereLeftIs(UChar l) const {
   set<int> eps;
   for(const auto& sp: spair) {  // [(l, r) : tag]
     if(sp.first.first == l) {
@@ -271,7 +270,7 @@ Alphabet::symbolsWhereLeftIs(wchar_t l) const {
   return eps;
 }
 
-void Alphabet::setSymbol(int symbol, wstring newSymbolString) {
+void Alphabet::setSymbol(int symbol, UnicodeString newSymbolString) {
   //Should be a special character!
   if (symbol < 0) slexicinv[-symbol-1] = newSymbolString;
 }

diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h
@@ -22,10 +22,11 @@
 #include <map>
 #include <set>
 #include <vector>
-
-#include <lttoolbox/ltstr.h>
+#include <unicode/unistr.h>
+#include <unicode/ustdio.h>
 
 using namespace std;
+using namespace icu;
 
 /**
  * Alphabet class.
@@ -38,13 +39,13 @@ class Alphabet
    * Symbol-identifier relationship. Only contains <tags>.
    * @see slexicinv
    */
-  map<wstring, int, Ltstr> slexic;
+  map<UnicodeString, int> slexic;
 
   /**
    * Identifier-symbol relationship. Only contains <tags>.
    * @see slexic
    */
-  vector<wstring> slexicinv;
+  vector<UnicodeString> slexicinv;
 
 
   /**
@@ -89,7 +90,7 @@ class Alphabet
   /**
    * Include a symbol into the alphabet.
    */
-  void includeSymbol(wstring const &s);
+  void includeSymbol(UnicodeString const &s);
 
   /**
    * Get an unique code for every symbol pair.  This flavour is for
@@ -99,22 +100,22 @@ class Alphabet
    * @return code for (c1, c2).
    */
   int operator()(int const c1, int const c2);
-  int operator()(wstring const &s) const;
+  int operator()(UnicodeString const &s) const;
 
   /**
    * Gets the individual symbol identifier. Assumes it already exists!
    * @see isSymbolDefined to check if it exists first.
    * @param s symbol to be identified.
    * @return symbol identifier.
    */
-  int operator()(wstring const &s);
+  int operator()(UnicodeString const &s);
 
   /**
    * Check wether the symbol is defined in the alphabet.
    * @param s symbol
    * @return true if defined
    */
-  bool isSymbolDefined(wstring const &s);
+  bool isSymbolDefined(UnicodeString const &s);
 
   /**
    * Returns the size of the alphabet (number of symbols).
@@ -142,15 +143,15 @@ class Alphabet
    * @param symbol symbol code.
    * @param output output stream.
    */
-  void writeSymbol(int const symbol, FILE *output) const;
+  void writeSymbol(int const symbol, UFILE *output) const;
 
   /**
    * Concat a symbol in the string that is passed by reference.
    * @param result string where the symbol should be concatenated
    * @param symbol code of the symbol
    * @param uppercase true if we want an uppercase symbol
    */
-  void getSymbol(wstring &result, int const symbol,
+  void getSymbol(UnicodeString &result, int const symbol,
 		 bool uppercase = false) const;
 
   /**
@@ -165,7 +166,7 @@ class Alphabet
    * @param symbol the code of the symbol to set
    * @param newSymbolString the new string for this symbol
    */
-  void setSymbol(int symbol, wstring newSymbolString);
+  void setSymbol(int symbol, UnicodeString newSymbolString);
 
   /**
    * Note: both the symbol int and int-pair are specific to this alphabet instance.
@@ -178,7 +179,7 @@ class Alphabet
   /**
    * Get all symbols where the left-hand side of the symbol-pair is l.
    */
-  set<int> symbolsWhereLeftIs(wchar_t l) const;
+  set<int> symbolsWhereLeftIs(UChar l) const;
 
   enum Side
   {
-Original file line number
+Diff line change
@@ Expand Up / @@ -80,3 +80,5 @@ @@
     *.egg-info/
     *.egg
     **/.mypy_cache/
+    *~