Skip to content

Commit

Permalink
ICU stuff (#115)
Browse files Browse the repository at this point in the history
ICU changes (closes #81)
- replace all instances of `std::wstring` with `UString` (= `std::basic_string<UChar>`)
- create `InputFile` wrapper to handle UTF-8 streams with nulls

efficiency, readability, and code style changes
- eliminate `Ltstr` and `string_to_wostream`
- simplify Makefile
- make transducer symbols `int32_t` rather than `int`
- make common symbols static attributes of `Transducer`
- extract some other string constants
- prefer `std::vector` to `std::list`
- prefer `.clear()` and `.empty()` to `= ""` and `== ""`
- prefer range-for loops
- remove old lsx code
- have `regexp_compiler` iterate over the input string rather than modifying it
- lift a static computation out of a loop in `Transducer::determinize()`
- move constant initializers to class header

helper function and dependency changes
- move `StringUtils` here from apertium
- depend on external utfcpp rather than bundling it
- make `XMLParseUtil` functions more specific to their typical use cases
- add `xml_walk_util.h` for cleanly iterating over children of `xmlNode*`
  • Loading branch information
mr-martian authored Jun 30, 2021
1 parent f2414b9 commit 81de698
Show file tree
Hide file tree
Showing 69 changed files with 3,087 additions and 4,225 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: dependencies
run: |
sudo apt-get -qy update
sudo apt-get -qfy install --no-install-recommends build-essential automake autotools-dev pkg-config libxml2-dev libxml2-utils python3-dev python3-setuptools swig
sudo apt-get -qfy install --no-install-recommends build-essential automake autotools-dev pkg-config libutfcpp-dev libxml2-dev libxml2-utils python3-dev python3-setuptools swig
- name: autoreconf
run: autoreconf -fvi
- name: configure
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,12 @@
/lttoolbox/lt-expand
/python/Makefile
/python/Makefile.in
/python/lttoolbox.i
/python/lttoolbox_wrap.cpp
/python/lttoolbox.py
/python/setup.py
/python/build*
*.egg-info/
*.egg
**/.mypy_cache/
*~
8 changes: 7 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@ compiler:
- clang
- gcc

addons:
homebrew:
packages:
- icu4c
- utf8cpp

before_install:
- if [ $TRAVIS_OS_NAME = linux ]; then sudo apt-get install -y swig; else brew install swig; fi
- if [ $TRAVIS_OS_NAME = linux ]; then sudo apt-get install -y swig libutfcpp-dev; else brew install swig utf8cpp; fi
script:
- $CXX --version
- autoreconf -fvi
Expand Down
3 changes: 1 addition & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,14 @@ if(WIN32)
add_definitions(-D_SECURE_SCL=0 -D_ITERATOR_DEBUG_LEVEL=0 -D_CRT_SECURE_NO_DEPRECATE -DWIN32_LEAN_AND_MEAN -DVC_EXTRALEAN -DNOMINMAX)
add_definitions(-DSTDC_HEADERS -DREGEX_MALLOC)
include_directories("lttoolbox/win32")
include_directories("utf8")
else()
add_definitions(-D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE)
endif()

# Unlocked I/O functions
include(CheckSymbolExists)
set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE)
foreach(func fread_unlocked fwrite_unlocked fgetc_unlocked fputc_unlocked fputs_unlocked fgetwc_unlocked fputwc_unlocked fputws_unlocked)
foreach(func fread_unlocked fwrite_unlocked fgetc_unlocked fputc_unlocked fputs_unlocked)
string(TOUPPER ${func} _uc)
CHECK_SYMBOL_EXISTS(${func} "stdio.h" HAVE_DECL_${_uc})
if(HAVE_DECL_${_uc})
Expand Down
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ endif
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = lttoolbox.pc

EXTRA_DIST=autogen.sh utf8 tests
EXTRA_DIST=autogen.sh tests

# TODO: the below will use python3 if you run it on Arch Linux with no python2 installed
test: tests/run_tests.py
Expand Down
2 changes: 2 additions & 0 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ Requirements:
* g++ >= 2.95
* GNU make
* libxml2 >= 2.6.17
* ICU
* utfcpp

Building & installing:

Expand Down
32 changes: 6 additions & 26 deletions configure.ac
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
AC_PREREQ(2.52)

m4_define([PKG_VERSION_MAJOR], [3])
m4_define([PKG_VERSION_MINOR], [5])
m4_define([PKG_VERSION_PATCH], [3])
m4_define([PKG_VERSION_MINOR], [6])
m4_define([PKG_VERSION_PATCH], [0])

AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [[email protected]], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox])

Expand Down Expand Up @@ -38,36 +38,16 @@ AC_ARG_ENABLE(profile,
[CXXFLAGS="-pg -g -Wall"; CFLAGS="-pg -g -Wall"; LDFLAGS="-pg"])


PKG_CHECK_MODULES(LTTOOLBOX, [libxml-2.0 >= 2.6.17])

# Check for wide strings
AC_DEFUN([AC_CXX_WSTRING],[
AC_CACHE_CHECK(whether the compiler supports wide strings,
ac_cv_cxx_wstring,
[AC_LANG_SAVE
AC_LANG_CPLUSPLUS
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <string>]],[[
std::wstring test = L"test";
]])],
[ac_cv_cxx_wstring=yes], [ac_cv_cxx_wstring=no])
AC_LANG_RESTORE
])
])

AC_CXX_WSTRING

if test "$ac_cv_cxx_wstring" = no
then
AC_MSG_ERROR([Missing wide string support])
fi

PKG_CHECK_MODULES(LIBXML, [libxml-2.0 >= 2.6.17])
PKG_CHECK_MODULES(ICU, [icu-i18n, icu-io, icu-uc])

# Checks for libraries.
AC_CHECK_LIB(xml2, xmlReaderForFile)

# Checks for header files.
AC_HEADER_STDC
AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h])
AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])])

# Checks for typedefs, structures, and compiler characteristics.
AC_HEADER_STDBOOL
Expand All @@ -78,7 +58,7 @@ AC_TYPE_SIZE_T
AC_FUNC_ERROR_AT_LINE

AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, \
fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fputws_unlocked, ungetwc_unlocked])
fputc_unlocked, fputs_unlocked])

AC_CHECK_FUNCS([setlocale strdup getopt_long])

Expand Down
1 change: 0 additions & 1 deletion lttoolbox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ if(WIN32)
win32/regex.c
win32/regex.h
win32/unistd.h
${CMAKE_SOURCE_DIR}/utf8/utf8_fwrap.h
${LIBLTTOOLBOX_SOURCES}
)
if(NOT VCPKG_TOOLCHAIN)
Expand Down
44 changes: 12 additions & 32 deletions lttoolbox/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@

h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \
deserialiser.h entry_token.h expander.h fst_processor.h lt_locale.h \
ltstr.h match_exe.h match_node.h match_state.h my_stdio.h node.h \
pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \
transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \
string_to_wostream.h
deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \
match_exe.h match_node.h match_state.h my_stdio.h node.h \
pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \
transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
ustring.h
cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
expander.cc fst_processor.cc lt_locale.cc match_exe.cc \
expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \
match_node.cc match_state.cc node.cc pattern_list.cc \
regexp_compiler.cc sorted_vector.cc state.cc transducer.cc \
trans_exe.cc xml_parse_util.cc tmx_compiler.cc
regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \
trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc

library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME)
library_include_HEADERS = $(h_sources)
Expand All @@ -27,33 +27,16 @@ lttoolboxlib = $(prefix)/lib

lttoolbox_DATA = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd

lt_print_SOURCES = lt_print.cc
lt_print_LDADD = liblttoolbox$(VERSION_MAJOR).la
lt_print_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)
LDADD = liblttoolbox$(VERSION_MAJOR).la
AM_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LIBXML_LIBS) $(ICU_LIBS)

lt_print_SOURCES = lt_print.cc
lt_trim_SOURCES = lt_trim.cc
lt_trim_LDADD = liblttoolbox$(VERSION_MAJOR).la
lt_trim_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)

lt_comp_SOURCES = lt_comp.cc
lt_comp_LDADD = liblttoolbox$(VERSION_MAJOR).la
lt_comp_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)

lt_proc_SOURCES = lt_proc.cc
lt_proc_LDADD = liblttoolbox$(VERSION_MAJOR).la
lt_proc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)

lt_expand_SOURCES = lt_expand.cc
lt_expand_LDADD = liblttoolbox$(VERSION_MAJOR).la
lt_expand_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)

lt_tmxcomp_SOURCES = lt_tmxcomp.cc
lt_tmxcomp_LDADD = liblttoolbox$(VERSION_MAJOR).la
lt_tmxcomp_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)

lt_tmxproc_SOURCES = lt_tmxproc.cc
lt_tmxproc_LDADD = liblttoolbox$(VERSION_MAJOR).la
lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)

#lt-validate-dictionary: Makefile.am validate-header.sh
# @echo "Creating lt-validate-dictionary script"
Expand All @@ -67,10 +50,7 @@ lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS)

man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1

INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS)
if WINDOWS
INCLUDES += -I$(top_srcdir)/utf8
endif
INCLUDES = -I$(top_srcdir) $(LIBXML_CFLAGS) $(ICU_CFLAGS)
CLEANFILES = *~

EXTRA_DIST = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd $(man_MANS)
Loading

0 comments on commit 81de698

Please sign in to comment.