diff --git a/.github/workflows/autotools-macos.yml b/.github/workflows/autotools-macos.yml index e904d71b1d..6b29fc4fd5 100644 --- a/.github/workflows/autotools-macos.yml +++ b/.github/workflows/autotools-macos.yml @@ -14,8 +14,7 @@ jobs: fail-fast: false matrix: config: - - { name: macos-12-clang-14-autotools, os: macos-12, cxx: clang++ } - #- { name: macos-12-gcc-11-autotools, os: macos-12, cxx: g++-11 } + - { name: macos-latest-clang-autotools, os: macos-latest, cxx: clang++ } steps: - uses: actions/checkout@v4 @@ -30,15 +29,11 @@ jobs: - name: Install dependencies run: | - brew install autoconf automake - brew install leptonica - brew install cairo pango icu4c - brew install cabextract - brew install libarchive curl + brew install autoconf automake cabextract libtool + brew install curl icu4c leptonica libarchive pango - name: Setup Tesseract run: | - mkdir -p m4 ./autogen.sh - name: Configure Tesseract @@ -115,7 +110,7 @@ jobs: fail-fast: false matrix: config: - - { name: macos-12-clang-14-autotools, os: macos-12, cxx: clang++ } + - { name: macos-latest-clang-autotools, os: macos-latest, cxx: clang++ } steps: - uses: actions/checkout@v4 @@ -130,7 +125,7 @@ jobs: - name: Install Macports run: | - curl -LO https://raw.githubusercontent.com/GiovanniBussi/macports-ci/master/macports-ci; source ./macports-ci install + curl -sSLO https://raw.githubusercontent.com/GiovanniBussi/macports-ci/master/macports-ci; source ./macports-ci install # --remove-brew does not remove the Homebrew entries in bin, # so remove them now. 
rm -v $(brew --prefix)/bin/* @@ -145,7 +140,6 @@ jobs: - name: Setup Tesseract run: | - mkdir -p m4 ./autogen.sh - name: Configure Tesseract diff --git a/.github/workflows/autotools-openmp.yml b/.github/workflows/autotools-openmp.yml index 0bebebfbb9..9ef0c13285 100644 --- a/.github/workflows/autotools-openmp.yml +++ b/.github/workflows/autotools-openmp.yml @@ -13,7 +13,7 @@ jobs: fail-fast: false matrix: config: - - { name: 20.04-openmp, os: ubuntu-20.04 } + - { name: 24.04-openmp, os: ubuntu-24.04 } - { name: 22.04-openmp, os: ubuntu-22.04 } steps: @@ -37,7 +37,6 @@ jobs: - name: Setup Tesseract run: | - mkdir -p m4 ./autogen.sh - name: Configure Tesseract diff --git a/.github/workflows/autotools.yml b/.github/workflows/autotools.yml index 5cdc1ffc92..ff61befb2d 100644 --- a/.github/workflows/autotools.yml +++ b/.github/workflows/autotools.yml @@ -13,10 +13,11 @@ jobs: fail-fast: false matrix: config: - # - { name: ubuntu-22.04-clang-15-autotools, os: ubuntu-22.04, cxx: clang++-15 } #installed + - { name: ubuntu-22.04-clang-15-autotools, os: ubuntu-22.04, cxx: clang++-15 } #installed - { name: ubuntu-24.04-gcc-14-autotools, os: ubuntu-24.04, cxx: g++-14 } #installed - - { name: ubuntu-24.04-gcc-13-autotools, os: ubuntu-24.04, cxx: g++-13 } #installed + + # - { name: ubuntu-24.04-gcc-13-autotools, os: ubuntu-24.04, cxx: g++-13 } #installed # - { name: ubuntu-22.04-gcc-12-autotools, os: ubuntu-22.04, cxx: g++-12 } #installed # - { name: ubuntu-22.04-gcc-11-autotools, os: ubuntu-22.04, cxx: g++-11 } #installed # - { name: ubuntu-20.04-gcc-10-autotools, os: ubuntu-20.04, cxx: g++-10 } #installed @@ -47,7 +48,6 @@ jobs: - name: Setup Tesseract run: | - mkdir -p m4 ./autogen.sh - name: Configure Tesseract diff --git a/.github/workflows/cmake-win64.yml b/.github/workflows/cmake-win64.yml index eacefc6c6a..9f450370c6 100644 --- a/.github/workflows/cmake-win64.yml +++ b/.github/workflows/cmake-win64.yml @@ -4,12 +4,12 @@ name: cmake-win64 on: #push: schedule: - - cron: 0 
23 * * * + - cron: 0 5 * * * workflow_dispatch: env: ILOC: d:/a/local - png_ver: 1643 + png_ver: 1644 jobs: build: @@ -37,9 +37,9 @@ jobs: run: | mkdir ${{env.ILOC}} - - name: Uninstall Perl - run: | - choco uninstall strawberryperl + #- name: Uninstall Perl + # run: | + # choco uninstall strawberryperl - name: Build and Install zlib-ng shell: cmd @@ -117,8 +117,8 @@ jobs: - name: Display Tesseract Version and Test Command Line Usage shell: cmd run: | - curl -L https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata --output ${{env.ILOC}}/share/tessdata/eng.traineddata - curl -L https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata --output ${{env.ILOC}}/share/tessdata/osd.traineddata + curl -sSL https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata --output ${{env.ILOC}}/share/tessdata/eng.traineddata + curl -sSL https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata --output ${{env.ILOC}}/share/tessdata/osd.traineddata echo "Setting TESSDATA_PREFIX..." set TESSDATA_PREFIX=${{env.ILOC}}/share/tessdata echo "Setting PATH..." 
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index de0af27835..0178834b1d 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -14,18 +14,14 @@ jobs: fail-fast: false matrix: config: - - { name: macos-12-clang-14-cmake, os: macos-12, cxx: clang++ } # default - - { name: macos-11-clang-13-cmake, os: macos-11, cxx: clang++ } # default - - - { name: macos-11-gcc-12-cmake, os: macos-11, cxx: g++-12 } #installed + - { name: macos-14-clang-15-cmake, os: macos-14, cxx: clang++ } # default + - { name: macos-14-gcc-14-cmake, os: macos-14, cxx: g++-14 } #installed + - { name: macos-15-clang-cmake, os: macos-15, cxx: clang++ } # default - { name: ubuntu-22.04-clang-15-cmake, os: ubuntu-22.04, cxx: clang++-15 } #installed - + - { name: ubuntu-24.04-gcc-14-cmake, os: ubuntu-24.04, cxx: g++-14 } #installed - { name: ubuntu-22.04-gcc-12-cmake, os: ubuntu-22.04, cxx: g++-12 } #installed - - { name: ubuntu-22.04-gcc-11-cmake, os: ubuntu-22.04, cxx: g++-11 } #installed - { name: ubuntu-20.04-gcc-10-cmake, os: ubuntu-20.04, cxx: g++-10 } #installed - - { name: ubuntu-20.04-gcc-9-cmake, os: ubuntu-20.04, cxx: g++-9 } #installed steps: - name: Install compilers on Linux @@ -55,6 +51,8 @@ jobs: brew install ninja ninja --version cmake --version + clang++ --version + g++ --version if: runner.os == 'macOS' - name: Checkout Source diff --git a/.github/workflows/installer-for-windows.yml b/.github/workflows/installer-for-windows.yml index 0a2b627840..12189231dc 100644 --- a/.github/workflows/installer-for-windows.yml +++ b/.github/workflows/installer-for-windows.yml @@ -39,7 +39,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Build Tesseract installer (64 bit) - run: .github/workflows/build.sh x86_64 + run: nsis/build.sh x86_64 - uses: actions/upload-artifact@v4 with: name: Tesseract Installer for Windows (64 bit) diff --git a/.github/workflows/msys2-4.1.1.yml b/.github/workflows/msys2-4.1.1.yml deleted file mode 100644 index 
47da44a437..0000000000 --- a/.github/workflows/msys2-4.1.1.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: msys2-4.1.1 -on: - #push: - schedule: - - cron: 0 18 1 * * -jobs: - windows: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - os: windows-2019 - msystem: MINGW32 - mingw_package_prefix: mingw-w64-i686 - - os: windows-2019 - msystem: MINGW64 - mingw_package_prefix: mingw-w64-x86_64 - defaults: - run: - shell: msys2 {0} - steps: - - uses: msys2/setup-msys2@v2 - with: - msystem: ${{ matrix.msystem }} - - run: pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-tesseract-ocr - - name: Display version - run: | - tesseract -v - text2image -v - lstmtraining -v diff --git a/.github/workflows/msys2.yml b/.github/workflows/msys2.yml index ceb45225c6..4ebb7a72a9 100644 --- a/.github/workflows/msys2.yml +++ b/.github/workflows/msys2.yml @@ -41,7 +41,6 @@ jobs: - name: Setup Tesseract run: | - mkdir -p m4 ./autogen.sh - name: Configure Tesseract diff --git a/.github/workflows/pkg-config-crosswrapper b/.github/workflows/pkg-config-crosswrapper deleted file mode 100755 index 86e2aed63b..0000000000 --- a/.github/workflows/pkg-config-crosswrapper +++ /dev/null @@ -1,54 +0,0 @@ -#! /bin/sh -# pkg-config wrapper for cross-building -# Sets pkg-config search path to search multiarch and historical cross-compiling paths. - -# If the user has already set PKG_CONFIG_LIBDIR, believe it (even if empty): -# it's documented to be an override -if [ x"${PKG_CONFIG_LIBDIR+set}" = x ]; then - # GNU triplet for the compiler, e.g. i486-linux-gnu for Debian i386, - # i686-linux-gnu for Ubuntu i386 - basename="$(basename "$0")" - triplet="${basename%-pkg-config}" - # Normalized multiarch path if any, e.g. i386-linux-gnu for i386 - dpkg-architecture >/dev/null 2>&1 - if [ "$?" != 0 ]; then - # dpkg-architecture is missing. 
- echo "Please install dpkg-dev to use pkg-config when cross-building" >&2 - exit 1 - fi - multiarch="$(dpkg-architecture -t"${triplet}" -qDEB_HOST_MULTIARCH 2>/dev/null)" - # Native multiarch path - native_multiarch="$(cat /usr/lib/pkg-config.multiarch)" - - # This can be used for native builds as well, in that case, just exec pkg-config "$@" directly. - if [ "$native_multiarch" = "$multiarch" ]; then - exec pkg-config "$@" - fi - - PKG_CONFIG_LIBDIR="/usr/local/${triplet}/lib/pkgconfig" - # For a native build we would also want to append /usr/local/lib/pkgconfig - # at this point; but this is a cross-building script, so don't - PKG_CONFIG_LIBDIR="$PKG_CONFIG_LIBDIR:/usr/local/share/pkgconfig" - - if [ -n "$multiarch" ]; then - PKG_CONFIG_LIBDIR="/usr/local/lib/${multiarch}/pkgconfig:$PKG_CONFIG_LIBDIR" - PKG_CONFIG_LIBDIR="$PKG_CONFIG_LIBDIR:/usr/lib/${multiarch}/pkgconfig" - fi - - PKG_CONFIG_LIBDIR="$PKG_CONFIG_LIBDIR:/usr/${triplet}/lib/pkgconfig" - # For a native build we would also want to append /usr/lib/pkgconfig - # at this point; but this is a cross-building script, so don't - # If you want to allow use of un-multiarched -dev packages for crossing - # (at the risk of finding build-arch stuff you didn't want, if not in a clean chroot) - # Uncomment the next line: - # PKG_CONFIG_LIBDIR="$PKG_CONFIG_LIBDIR:/usr/lib/pkgconfig" - # ... 
but on Ubuntu we rely cross-building with non-multiarch libraries: - if dpkg-vendor --derives-from Ubuntu; then - PKG_CONFIG_LIBDIR="$PKG_CONFIG_LIBDIR:/usr/lib/pkgconfig" - fi - PKG_CONFIG_LIBDIR="$PKG_CONFIG_LIBDIR:/usr/share/pkgconfig" - - export PKG_CONFIG_LIBDIR -fi - -exec pkg-config "$@" diff --git a/.github/workflows/sw.yml b/.github/workflows/sw.yml index c744369086..db11bb5b57 100644 --- a/.github/workflows/sw.yml +++ b/.github/workflows/sw.yml @@ -1,23 +1,9 @@ name: sw on: - push: - paths: - - '**.cpp' - - '**.h' - - '**/sw.yml' - - 'unittest/**.c' - - 'unittest/**.cc' - pull_request: - paths: - - '**.cpp' - - '**.h' - - '**/sw.yml' - - 'unittest/**.c' - - 'unittest/**.cc' schedule: - # every day - - cron: 0 0 * * * + # every 3rd day + - cron: 0 0 */3 * * jobs: build: diff --git a/.github/workflows/unittest-cmake.yml b/.github/workflows/unittest-cmake.yml new file mode 100644 index 0000000000..0174499dc6 --- /dev/null +++ b/.github/workflows/unittest-cmake.yml @@ -0,0 +1,76 @@ +name: unittest_cmake +# cmake build on ubuntu. unittests with address sanitizers. with openmp. +# ubuntu-20.04-gcc-unittest - CI runs out of diskspace. 
+on: + #push: + pull_request: + paths: + - '**.cpp' + - '**.h' + - '**Makefile.am' + - '/configure.ac' + - 'unittest/**.c' + - 'unittest/**.cc' + schedule: + - cron: 0 0 * * * + workflow_dispatch: + +jobs: + sanitizers: + name: ${{ matrix.config.name }} + runs-on: ${{ matrix.config.os }} + strategy: + fail-fast: false + matrix: + config: + - { name: ubuntu-24.04-gcc-unittest, os: ubuntu-24.04, cxx: g++, cxxflags: '-g -O2 -fsanitize=address,undefined' } + - { name: ubuntu-22.04-clang-unittest, os: ubuntu-22.04, cxx: clang++, cxxflags: '-g -O2 -fsanitize=address,undefined -stdlib=libc++' } + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Remove Homebrew, Android and .NET to provide more disk space + run: | + # https://github.com/actions/virtual-environments/issues/2606#issuecomment-772683150 + sudo rm -rf /home/linuxbrew # will release Homebrew + sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android + sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET + + - name: Install dependencies (Linux) + run: | + sudo apt-get update + sudo apt-get install autoconf libleptonica-dev libpango1.0-dev -y + sudo apt-get install cabextract -y + + - name: Setup + run: | + ./autogen.sh + + - name: Configure (Linux) + run: | + cmake -S . 
-B build -DSW_BUILD=OFF -DBUILD_SHARED_LIBS=OFF -DBUILD_TRAINING_TOOLS=ON -DBUILD_TESTS=ON \ + -DCMAKE_BUILD_TYPE=Release 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=${{ matrix.config.cxxflags }}' + + - name: Make and Install Tesseract + run: | + ${{ matrix.config.cxx }} --version + cmake --build build --config Release -j + + - name: Download fonts, tessdata and langdata required for tests + run: | + git clone https://github.com/egorpugin/tessdata tessdata_unittest + cp tessdata_unittest/fonts/* test/testing/ + mv tessdata_unittest/* ../ + + - name: Make and run Unit Tests + run: | + cd build + ctest -C Release --output-on-failure -O test-suite.log -j 4 + + - name: Display Unit Tests Report and Compiler Version + run: | + cat test-suite.log + ${{ matrix.config.cxx }} --version + git log -3 --pretty=format:'%h %ad %s | %an' + if: always() diff --git a/.github/workflows/unittest-disablelegacy.yml b/.github/workflows/unittest-disablelegacy.yml index fef3e61a92..72b7319747 100644 --- a/.github/workflows/unittest-disablelegacy.yml +++ b/.github/workflows/unittest-disablelegacy.yml @@ -15,8 +15,8 @@ jobs: strategy: fail-fast: false matrix: - compiler: [ g++, clang++-15 ] - os: [ ubuntu-22.04 ] + compiler: [ g++, clang++-18 ] + os: [ ubuntu-24.04 ] steps: - uses: actions/checkout@v4 @@ -32,7 +32,6 @@ jobs: - name: Setup run: | - mkdir -p m4 ./autogen.sh - name: Configure diff --git a/.github/workflows/unittest-macos.yml b/.github/workflows/unittest-macos.yml index d4414f583c..e34fae90b6 100644 --- a/.github/workflows/unittest-macos.yml +++ b/.github/workflows/unittest-macos.yml @@ -14,8 +14,8 @@ jobs: matrix: config: - { name: macos-arm-14-clang-unittest, os: macos-14, cxx: clang++ } # Apple silicon - - { name: macos-12-clang-unittest, os: macos-12, cxx: clang++ } - - { name: macos-12-gcc-unittest, os: macos-12, cxx: g++ } + - { name: macos-latest-clang-unittest, os: macos-latest, cxx: clang++ } + - { name: macos-latest-gcc-unittest, os: macos-latest, cxx: g++ } steps: - 
uses: actions/checkout@v4 @@ -24,13 +24,10 @@ jobs: - name: Install dependencies (macOS Homebrew) run: | - brew install autoconf automake libarchive - brew install leptonica cairo pango - brew install cabextract libtool - + brew install autoconf automake cabextract libtool + brew install curl icu4c leptonica libarchive pango - name: Setup run: | - mkdir -p m4 ./autogen.sh - name: Configure (macOS Homebrew) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 2143f64a45..ccb4468a56 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -3,6 +3,14 @@ name: unittest # ubuntu-20.04-gcc-unittest - CI runs out of diskspace. on: push: + pull_request: + paths: + - '**.cpp' + - '**.h' + - '**Makefile.am' + - '/configure.ac' + - 'unittest/**.c' + - 'unittest/**.cc' #schedule: # - cron: 0 0 * * * workflow_dispatch: @@ -15,7 +23,7 @@ jobs: fail-fast: false matrix: config: - - { name: ubuntu-20.04-gcc-unittest, os: ubuntu-20.04, cxx: g++, cxxflags: '-g -O2 -fsanitize=address,undefined' } + - { name: ubuntu-24.04-gcc-unittest, os: ubuntu-24.04, cxx: g++, cxxflags: '-g -O2 -fsanitize=address,undefined' } - { name: ubuntu-22.04-clang-unittest, os: ubuntu-22.04, cxx: clang++, cxxflags: '-g -O2 -fsanitize=address,undefined -stdlib=libc++' } steps: - uses: actions/checkout@v4 @@ -39,7 +47,6 @@ jobs: - name: Setup run: | - mkdir -p m4 ./autogen.sh - name: Configure (Linux) diff --git a/CMakeLists.txt b/CMakeLists.txt index faa1fa1564..aafe446dc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,7 +288,7 @@ endif() # Compiler specific environment if(CMAKE_COMPILER_IS_GNUCXX OR MINGW) set(CMAKE_CXX_FLAGS_DEBUG - "${CMAKE_CXX_FLAGS_DEBUG} -Wall -DDEBUG -pedantic -Og") + "${CMAKE_CXX_FLAGS_DEBUG} -Wall -DDEBUG -pedantic -Og -Wno-unknown-pragmas") elseif(MSVC) add_definitions(-D_CRT_SECURE_NO_WARNINGS) add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) # strdup @@ -312,6 +312,10 @@ elseif(MSVC) set(CMAKE_MSVC_RUNTIME_LIBRARY 
"MultiThreaded$<$<CONFIG:Debug>:Debug>") message(STATUS "Building with static CRT.") endif() + # Workaround: When building on VS 2022 17.10 or newer, but using an older runtime, + # mutexes can crash + # https://stackoverflow.com/questions/78598141/first-stdmutexlock-crashes-in-application-built-with-latest-visual-studio + add_definitions(-D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR) endif() if(CLANG) # clang all platforms set(CMAKE_CXX_FLAGS_RELEASE @@ -326,7 +330,10 @@ if(OPENMP_BUILD set(OPENMP_BUILD OFF) endif() if(OPENMP_BUILD) - find_package(OpenMP QUIET) + if(MSVC) # supported from cmake 3.30 + set(OpenMP_RUNTIME_MSVC "llvm") + endif(MSVC) + find_package(OpenMP) # https://stackoverflow.com/questions/12399422 # how-to-set-linker-flags-for-openmp-in-cmakes-try-compile-function if(NOT OpenMP_FOUND @@ -525,6 +532,7 @@ message(STATUS "General configuration for Tesseract ${PACKAGE_VERSION}") message(STATUS "--------------------------------------------------------") message(STATUS "Build type: ${CMAKE_BUILD_TYPE} ${BUILD_ARCH}") message(STATUS "Compiler: ${CMAKE_CXX_COMPILER_ID}") +message(STATUS "Compiler version: ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "Used standard: C++${CMAKE_CXX_STANDARD}") message(STATUS "CXX compiler options: ${COMPILER_FLAGS}") get_directory_property(DirCompDefs COMPILE_DEFINITIONS) @@ -746,6 +754,7 @@ set(TESSERACT_SRC src/api/altorenderer.cpp src/api/pagerenderer.cpp src/api/hocrrenderer.cpp + src/api/jsonrenderer.cpp src/api/lstmboxrenderer.cpp src/api/pdfrenderer.cpp src/api/wordstrboxrenderer.cpp) @@ -835,7 +844,7 @@ set_target_properties( ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}) set_target_properties( libtesseract PROPERTIES SOVERSION - ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}) + ${VERSION_MAJOR}.${VERSION_MINOR}) set_target_properties( libtesseract @@ -906,7 +915,9 @@ if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/third_party/googletest/CMakeLists.txt ) + enable_testing() 
add_subdirectory(unittest/third_party/googletest) + add_subdirectory(unittest) endif() if(BUILD_TRAINING_TOOLS) diff --git a/ChangeLog b/ChangeLog index a331550d2d..9e7ec162cf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +2024-11-10 - V5.5.0 +* Set hOCR capabilities ocrp_dir and ocrp_lang unconditionally. +* Calculate row bounding box in single-word mode per (issue #4304). +* Reduce clock syscalls (#4303). +* Several small performance and other code fixes. +* Modernized code. +* Print time for tessedit_timing_debug in milliseconds. +* Print time for ErrorCounter::ComputeErrorRate in milliseconds. +* cmake: Correctly set the soversion based on SemVer properties. +* Do not export PDBs for static libraries (issue #4279). +* Several other small fixes and improvements for builds and CI. +* Modernize code for renderers and remove filename conversion for Windows (#4330). +* Add build rule for Windows installer. +* Support symbolic values for --oem and --psm options. +* Remove Tensorflow support. +* Add RISC-V V support (#4346). +* Remove broken GitHub action msys2-4.1.1. + 2024-06-11 - V5.4.1 * Avoid FP overflow in NormEvidenceOf (fixes issue #4257) (#4259) * Small build fixes and code improvements (#4262, #4263, #4266, #4267) diff --git a/Makefile.am b/Makefile.am index ec59b8708c..62b699de76 100644 --- a/Makefile.am +++ b/Makefile.am @@ -26,7 +26,6 @@ uninstall-hook: rm -rf $(DESTDIR)$(pkgincludedir) dist-hook: -# Need to remove .svn directories from directories # added using EXTRA_DIST. $(distdir)/tessdata would in # theory suffice. 
rm -rf `find $(distdir) -name .deps -type d` @@ -112,7 +111,6 @@ lib_LTLIBRARIES = libtesseract.la libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) libtesseract_la_LDFLAGS += $(libarchive_LIBS) libtesseract_la_LDFLAGS += $(libcurl_LIBS) -libtesseract_la_LDFLAGS += $(TENSORFLOW_LIBS) if T_WIN libtesseract_la_LDFLAGS += -no-undefined -lws2_32 else @@ -219,6 +217,24 @@ libtesseract_neon_la_CXXFLAGS += -DHAVE_HWCAP_BASED_NEON_RUNTIME_DETECTION endif endif +if HAVE_RVV +libtesseract_rvv_la_CXXFLAGS = $(RVV_CXXFLAGS) +libtesseract_rvv_la_CXXFLAGS += -O3 +libtesseract_rvv_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil +libtesseract_rvv_la_SOURCES = src/arch/intsimdmatrixrvv.cpp +libtesseract_la_LIBADD += libtesseract_rvv.la +noinst_LTLIBRARIES += libtesseract_rvv.la +endif + +if HAVE_RVV +libtesseract_rvv_la_CXXFLAGS = $(RVV_CXXFLAGS) +libtesseract_rvv_la_CXXFLAGS += -O3 +libtesseract_rvv_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil +libtesseract_rvv_la_SOURCES = src/arch/intsimdmatrixrvv.cpp +libtesseract_la_LIBADD += libtesseract_rvv.la +noinst_LTLIBRARIES += libtesseract_rvv.la +endif + libtesseract_la_SOURCES += src/arch/intsimdmatrix.cpp libtesseract_la_SOURCES += src/arch/simddetect.cpp @@ -411,9 +427,6 @@ endif noinst_LTLIBRARIES += libtesseract_ccutil.la libtesseract_ccutil_la_SOURCES = src/ccutil/ccutil.cpp -libtesseract_ccutil_la_SOURCES += src/ccutil/clst.cpp -libtesseract_ccutil_la_SOURCES += src/ccutil/elst2.cpp -libtesseract_ccutil_la_SOURCES += src/ccutil/elst.cpp libtesseract_ccutil_la_SOURCES += src/ccutil/errcode.cpp libtesseract_ccutil_la_SOURCES += src/ccutil/fopenutf8.cpp libtesseract_ccutil_la_SOURCES += src/ccutil/serialis.cpp @@ -537,10 +550,6 @@ libtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/dict libtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/lstm libtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/viewer libtesseract_lstm_la_CPPFLAGS += -I$(top_srcdir)/src/fmt/include -if TENSORFLOW -libtesseract_lstm_la_CPPFLAGS += -DINCLUDE_TENSORFLOW 
-libtesseract_lstm_la_CPPFLAGS += -I/usr/include/tensorflow -endif if !NO_TESSDATA_PREFIX libtesseract_lstm_la_CPPFLAGS += -DTESSDATA_PREFIX='"@datadir@"' endif @@ -563,7 +572,6 @@ noinst_HEADERS += src/lstm/reversed.h noinst_HEADERS += src/lstm/series.h noinst_HEADERS += src/lstm/static_shape.h noinst_HEADERS += src/lstm/stridemap.h -noinst_HEADERS += src/lstm/tfnetwork.h noinst_HEADERS += src/lstm/weightmatrix.h noinst_LTLIBRARIES += libtesseract_lstm.la @@ -584,11 +592,7 @@ libtesseract_lstm_la_SOURCES += src/lstm/reconfig.cpp libtesseract_lstm_la_SOURCES += src/lstm/reversed.cpp libtesseract_lstm_la_SOURCES += src/lstm/series.cpp libtesseract_lstm_la_SOURCES += src/lstm/stridemap.cpp -libtesseract_lstm_la_SOURCES += src/lstm/tfnetwork.cpp libtesseract_lstm_la_SOURCES += src/lstm/weightmatrix.cpp -if TENSORFLOW -libtesseract_lstm_la_SOURCES += src/lstm/tfnetwork.pb.cc -endif # Rules for src/textord. @@ -766,7 +770,6 @@ tesseract_LDFLAGS = $(OPENMP_CXXFLAGS) tesseract_LDADD = libtesseract.la tesseract_LDADD += $(LEPTONICA_LIBS) -tesseract_LDADD += $(TENSORFLOW_LIBS) tesseract_LDADD += $(libarchive_LIBS) tesseract_LDADD += $(libcurl_LIBS) @@ -937,7 +940,6 @@ EXTRA_PROGRAMS += $(trainingtools) extralib = libtesseract.la extralib += $(libarchive_LIBS) extralib += $(LEPTONICA_LIBS) -extralib += $(TENSORFLOW_LIBS) if T_WIN extralib += -lws2_32 endif @@ -1051,7 +1053,6 @@ fuzzer-api-512x256: fuzzer-api $< \ $(builddir)/.libs/libtesseract.a \ $(LEPTONICA_LIBS) \ - $(TENSORFLOW_LIBS) \ $(libarchive_LIBS) \ $(libcurl_LIBS) \ -o $@ @@ -1071,7 +1072,6 @@ fuzzer-api-512x256: unittest/fuzzers/fuzzer-api.cpp $< \ $(builddir)/.libs/libtesseract.a \ $(LEPTONICA_LIBS) \ - $(TENSORFLOW_LIBS) \ $(libarchive_LIBS) \ $(libcurl_LIBS) \ -o $@ @@ -1185,10 +1185,6 @@ endif # ENABLE_TRAINING unittest_CPPFLAGS += -I$(top_srcdir)/src/viewer unittest_CPPFLAGS += -I$(top_srcdir)/src/wordrec unittest_CPPFLAGS += -I$(top_srcdir)/unittest -if TENSORFLOW -unittest_CPPFLAGS += -DINCLUDE_TENSORFLOW 
-unittest_CPPFLAGS += -I/usr/include/tensorflow -endif # TENSORFLOW # Build googletest: check_LTLIBRARIES = libgtest.la libgtest_main.la libgmock.la libgmock_main.la @@ -1216,7 +1212,6 @@ GTEST_LIBS = libgtest.la libgtest_main.la -lpthread GMOCK_LIBS = libgmock.la libgmock_main.la TESS_LIBS = $(GTEST_LIBS) TESS_LIBS += libtesseract.la $(libarchive_LIBS) -TESS_LIBS += $(TENSORFLOW_LIBS) TRAINING_LIBS = libtesseract_training.la TRAINING_LIBS += $(TESS_LIBS) unittest_CPPFLAGS += -isystem $(top_srcdir)/unittest/third_party/googletest/googletest/include @@ -1479,10 +1474,6 @@ networkio_test_CPPFLAGS = $(unittest_CPPFLAGS) networkio_test_LDADD = $(TESS_LIBS) normstrngs_test_SOURCES = unittest/normstrngs_test.cc -if TENSORFLOW -normstrngs_test_SOURCES += unittest/third_party/utf/rune.c -normstrngs_test_SOURCES += unittest/util/utf8/unilib.cc -endif # TENSORFLOW normstrngs_test_CPPFLAGS = $(unittest_CPPFLAGS) normstrngs_test_LDADD = $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS) @@ -1501,14 +1492,9 @@ pagesegmode_test_CPPFLAGS = $(unittest_CPPFLAGS) pagesegmode_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS) pango_font_info_test_SOURCES = unittest/pango_font_info_test.cc -if TENSORFLOW -pango_font_info_test_SOURCES += unittest/third_party/utf/rune.c -pango_font_info_test_SOURCES += unittest/util/utf8/unicodetext.cc -pango_font_info_test_SOURCES += unittest/util/utf8/unilib.cc -endif # TENSORFLOW pango_font_info_test_CPPFLAGS = $(unittest_CPPFLAGS) pango_font_info_test_LDADD = $(TRAINING_LIBS) $(LEPTONICA_LIBS) -pango_font_info_test_LDADD += $(ICU_I18N_LIBS) +pango_font_info_test_LDADD += $(ICU_I18N_LIBS) $(ICU_UC_LIBS) pango_font_info_test_LDADD += $(pangocairo_LIBS) pango_font_info_test_LDADD += $(pangoft2_LIBS) diff --git a/README.md b/README.md index 7bf9b6cbee..a5a090a457 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,9 @@ Here's the original feature request upstream: https://github.com/tesseract-ocr/t # Tesseract OCR -[![Build 
status](https://ci.appveyor.com/api/projects/status/miah0ikfsf0j3819/branch/master?svg=true)](https://ci.appveyor.com/project/zdenop/tesseract/) -[![Build status](https://github.com/tesseract-ocr/tesseract/actions/workflows/sw.yml/badge.svg)](https://github.com/tesseract-ocr/tesseract/actions/workflows/sw.yml)\ [![Coverity Scan Build Status](https://scan.coverity.com/projects/tesseract-ocr/badge.svg)](https://scan.coverity.com/projects/tesseract-ocr) [![CodeQL](https://github.com/tesseract-ocr/tesseract/workflows/CodeQL/badge.svg)](https://github.com/tesseract-ocr/tesseract/security/code-scanning) -[![OSS-Fuzz](https://img.shields.io/badge/oss--fuzz-fuzzing-brightgreen)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=2&q=proj:tesseract-ocr) +[![OSS-Fuzz](https://img.shields.io/badge/oss--fuzz-fuzzing-brightgreen)](https://issues.oss-fuzz.com/issues?q=is:open%20title:tesseract-ocr) \ [![GitHub license](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](https://raw.githubusercontent.com/tesseract-ocr/tesseract/main/LICENSE) [![Downloads](https://img.shields.io/badge/download-all%20releases-brightgreen.svg)](https://github.com/tesseract-ocr/tesseract/releases/) diff --git a/VERSION b/VERSION index ade65226e0..d50359de18 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.4.1 +5.5.0 diff --git a/configure.ac b/configure.ac index fb8e8caf8d..46817ea48c 100644 --- a/configure.ac +++ b/configure.ac @@ -29,7 +29,7 @@ AM_INIT_AUTOMAKE([foreign subdir-objects nostdinc]) # Define date of package, etc. Could be useful in auto-generated # documentation. 
PACKAGE_YEAR=2024 -PACKAGE_DATE="06/11" +PACKAGE_DATE="11/10" abs_top_srcdir=`AS_DIRNAME([$0])` @@ -133,6 +133,7 @@ AM_CONDITIONAL([HAVE_AVX512VNNI], false) AM_CONDITIONAL([HAVE_FMA], false) AM_CONDITIONAL([HAVE_SSE4_1], false) AM_CONDITIONAL([HAVE_NEON], false) +AM_CONDITIONAL([HAVE_RVV], false) case "${host_cpu}" in @@ -206,6 +207,16 @@ case "${host_cpu}" in ;; + riscv*) + + AX_CHECK_COMPILE_FLAG([-march=rv64gcv], [rvv=true], [rvv=false], [$WERROR]) + AM_CONDITIONAL([HAVE_RVV], [$rvv]) + if $rvv; then + AC_DEFINE([HAVE_RVV], [1], [Enable RVV instructions]) + check_for_rvv=1 + fi + ;; + *) AC_MSG_WARN([No compiler options for $host_cpu]) @@ -225,6 +236,16 @@ if test x$check_for_neon = x1; then fi fi +# additional checks for RVV targets +if test x$check_for_rvv = x1; then + AC_MSG_NOTICE([checking how to detect RVV availability]) + AC_CHECK_FUNCS([getauxval]) + + if test $ac_cv_func_getauxval = no; then + AC_MSG_WARN([RVV is available, but we don't know how to check for it. Will not be able to use RVV.]) + fi +fi + AX_CHECK_COMPILE_FLAG([-fopenmp-simd], [openmp_simd=true], [openmp_simd=false], [$WERROR]) AM_CONDITIONAL([OPENMP_SIMD], $openmp_simd) @@ -300,25 +321,6 @@ AC_ARG_WITH([curl], AS_HELP_STRING([--with-curl], [Build with libcurl which supports processing an image URL @<:@default=check@:>@]), [], [with_curl=check]) -AC_ARG_WITH([tensorflow], - AS_HELP_STRING([--with-tensorflow], - [support TensorFlow @<:@default=check@:>@]), - [], [with_tensorflow=check]) - -# Check whether to build with support for TensorFlow. 
-AM_CONDITIONAL([TENSORFLOW], false) -TENSORFLOW_LIBS= -AS_IF([test "x$with_tensorflow" != xno], - [AC_CHECK_HEADERS([tensorflow/core/framework/graph.pb.h], - [AC_SUBST([TENSORFLOW_LIBS], ["-lprotobuf -ltensorflow_cc"]) - AM_CONDITIONAL([TENSORFLOW], true) - ], - [if test "x$with_tensorflow" != xcheck; then - AC_MSG_FAILURE( - [--with-tensorflow was given, but test for libtensorflow-dev failed]) - fi - ]) - ]) # https://lists.apple.com/archives/unix-porting/2009/Jan/msg00026.html m4_define([MY_CHECK_FRAMEWORK], diff --git a/doc/tesseract.1.asc b/doc/tesseract.1.asc index cb5d8837d2..b4730ea18e 100644 --- a/doc/tesseract.1.asc +++ b/doc/tesseract.1.asc @@ -15,7 +15,7 @@ DESCRIPTION tesseract(1) is a commercial quality OCR engine originally developed at HP between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed -at Google since then. +at Google until 2018. IN/OUT ARGUMENTS diff --git a/include/tesseract/baseapi.h b/include/tesseract/baseapi.h index 588e2121ef..49c09eddce 100644 --- a/include/tesseract/baseapi.h +++ b/include/tesseract/baseapi.h @@ -306,10 +306,6 @@ class TESS_API TessBaseAPI { */ void DumpVariables(FILE *fp) const; - // Functions added by Tesseract.js-core to save and restore parameters - void SaveParameters(); - void RestoreParameters(); - /** * Get value of named variable as a string, if it exists. */ @@ -827,6 +823,25 @@ class TESS_API TessBaseAPI { */ char *GetHOCRText(int page_number); + /** + * Make a JSON-formatted string with JSON from the internal + * data structures. + * page_number is 0-based but will appear in the output as 1-based. + * monitor can be used to + * cancel the recognition + * receive progress callbacks + * Returned string must be freed with the delete [] operator. + */ + char *GetJSONText(ETEXT_DESC *monitor, int page_number); + + /** + * Make a JSON-formatted string with JSON from the internal + * data structures. 
+ * page_number is 0-based but will appear in the output as 1-based. + * Returned string must be freed with the delete [] operator. + */ + char *GetJSONText(int page_number); + /** * Make an XML-formatted string with Alto markup from the internal * data structures. diff --git a/include/tesseract/image.h b/include/tesseract/image.h index 9e4322487c..d59fd0d564 100644 --- a/include/tesseract/image.h +++ b/include/tesseract/image.h @@ -55,7 +55,9 @@ class TESS_API Image { // api Pix *clone2pix() const; // increases refcount - Image cccclone() const; // increases refcount +#if 0 + Image clone() const; // increases refcount +#endif Image copy() const; // does full copy void destroy(); bool isZero() const; diff --git a/include/tesseract/preparation.h b/include/tesseract/preparation.h index 76f596a6f9..c058c4b2ea 100644 --- a/include/tesseract/preparation.h +++ b/include/tesseract/preparation.h @@ -1,5 +1,7 @@ -#define _USE_MATH_DEFINES // for M_PI, when you load math.h +#if defined(_MSC_VER) && !defined(_USE_MATH_DEFINES) +#error "tesseract needs you to define _USE_MATH_DEFINES when compiling with MSVC to get access to M_PI et al on the Win32 platform." +#endif // Include automatically generated configuration file if running autoconf. 
#ifdef HAVE_TESSERACT_CONFIG_H diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h index 7c4f79bc99..c4dda098ca 100644 --- a/include/tesseract/renderer.h +++ b/include/tesseract/renderer.h @@ -182,6 +182,20 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer { bool font_info_; // whether to print font information }; +/** + * Renders tesseract output into an json text string + */ +class TESS_API TessJsonRenderer : public TessResultRenderer { +public: + explicit TessJsonRenderer(const char *outputbase); + +protected: + bool BeginDocumentHandler() override; + bool AddImageHandler(TessBaseAPI *api) override; + bool EndDocumentHandler() override; +}; + + /** * Renders tesseract output into an alto text string */ diff --git a/include/tesseract/tprintf.h b/include/tesseract/tprintf.h index 83467a98af..7deab26b39 100644 --- a/include/tesseract/tprintf.h +++ b/include/tesseract/tprintf.h @@ -47,17 +47,17 @@ void tprintError(const S *format, Args &&...args) { template void tprintWarn(const S *format, Args &&...args) { - vTessPrint(T_LOG_WARN, format, fmt::make_format_args(args...)); + vTessPrint(T_LOG_WARN, format, fmt::make_format_args(args...)); } template void tprintInfo(const S *format, Args &&...args) { - vTessPrint(T_LOG_INFO, format, fmt::make_format_args(args...)); + vTessPrint(T_LOG_INFO, format, fmt::make_format_args(args...)); } template void tprintDebug(const S *format, Args &&...args) { - vTessPrint(T_LOG_DEBUG, format, fmt::make_format_args(args...)); + vTessPrint(T_LOG_DEBUG, format, fmt::make_format_args(args...)); } template @@ -65,14 +65,16 @@ void tprintTrace(const S *format, Args &&...args) { vTessPrint(T_LOG_TRACE, format, fmt::make_format_args(args...)); } + ///////////////////////////////////////////////////////////////////////////////// // Signal the tprintf line gatherer that the next lines printed, even when terminated // by a '\n' newline, are to be kept together as a single pack, a single message. 
// // Any such grouping is ended by the class instance going out of scope (and its destructor -// being invoked to produce the desired 'side effect') or the grouping is broken up -// when a different log level message zips through: errors break up warnings/info/debug info, etc. +// being invoked to produce the desired 'side effect'). +// Also note that the grouping is broken up when a different log level message zips through: +// errors break up warnings/info/debug info, etc. // // Anyway, this class only lives for its side effects in tprint log channel: class TPrintGroupLinesTillEndOfScope { diff --git a/java/Makefile.am b/java/Makefile.am index 8e5610204d..92b91b220d 100644 --- a/java/Makefile.am +++ b/java/Makefile.am @@ -51,9 +51,9 @@ $(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) $(SCROLLVIEW_LIBS) .PHONY: fetch-jars fetch-jars $(SCROLLVIEW_LIBS): - curl -s -S -L -o piccolo2d-core-3.0.1.jar https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0.1/piccolo2d-core-3.0.1.jar - curl -s -S -L -o piccolo2d-extras-3.0.1.jar https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0.1/piccolo2d-extras-3.0.1.jar - curl -s -S -L -o jaxb-api-2.3.1.jar https://search.maven.org/remotecontent?filepath=javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar + curl -sSLO https://repo1.maven.org/maven2/org/piccolo2d/piccolo2d-core/3.0.1/piccolo2d-core-3.0.1.jar + curl -sSLO https://repo1.maven.org/maven2/org/piccolo2d/piccolo2d-extras/3.0.1/piccolo2d-extras-3.0.1.jar + curl -sSLO https://repo1.maven.org/maven2/javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar .PHONY: install-jars install-jars : ScrollView.jar diff --git a/nsis/Makefile.am b/nsis/Makefile.am index cae33db56c..917f1431e7 100644 --- a/nsis/Makefile.am +++ b/nsis/Makefile.am @@ -8,7 +8,15 @@ gitrev="$(shell git --git-dir=${abs_top_srcdir}/.git --work-tree=${abs_top_srcdi .PHONY: winsetup -winsetup: +Plugins/x86-unicode/INetC.dll: + curl -OsS 
https://nsis.sourceforge.io/mediawiki/images/c/c9/Inetc.zip + unzip Inetc.zip $@ + +winpath.exe: winpath.cpp + x86_64-w64-mingw32-g++ -Os -o $@ $< + x86_64-w64-mingw32-strip --strip-unneeded $@ + +winsetup: Plugins/x86-unicode/INetC.dll winpath.exe makensis -DCROSSBUILD -DSHARED -DSIGNCODE=$(SIGNCODE) -DSRCDIR=$(top_srcdir) -DVERSION=${gitrev} $(shell test "$(host_cpu)" = x86_64 && echo "-DW64") -NOCD $(top_srcdir)/nsis/tesseract.nsi endif diff --git a/nsis/build.sh b/nsis/build.sh new file mode 100755 index 0000000000..49245a1b43 --- /dev/null +++ b/nsis/build.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# GitHub actions - Create Tesseract installer for Windows + +# Author: Stefan Weil (2010-2024) + +set -e +set -x + +LANG=C.UTF-8 + +ARCH=$1 + +if [ "$ARCH" = "i686" ]; then + MINGW=/mingw32 +else + ARCH=x86_64 + MINGW=/mingw64 +fi + +ROOTDIR=$PWD +DISTDIR=$ROOTDIR/dist +HOST=$ARCH-w64-mingw32 +TAG=$(cat VERSION).$(date +%Y%m%d) +BUILDDIR=bin/ndebug/$HOST-$TAG +PKG_ARCH=mingw-w64-${ARCH/_/-} + +# Install packages. +sudo apt-get update --quiet +sudo apt-get install --assume-yes --no-install-recommends --quiet \ + asciidoc curl xsltproc docbook-xml docbook-xsl \ + automake dpkg-dev libtool pkg-config default-jdk-headless \ + mingw-w64-tools nsis g++-"$PKG_ARCH" \ + makepkg pacman-package-manager python3-venv unzip + +# Configure pacman. + +# Enable mirrorlist. +sudo sed -Ei 's/^#.*(Include.*mirrorlist)/\1/' /etc/pacman.conf +( +# Add msys key for pacman. +cd /usr/share/keyrings +sudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-keyring/master/msys2.gpg +sudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-keyring/master/msys2-revoked +sudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-keyring/master/msys2-trusted +) +( +# Add active environments for pacman. +# See https://www.msys2.org/docs/repos-mirrors/. 
+sudo mkdir -p /etc/pacman.d +cd /etc/pacman.d +cat </dev/null +[mingw64] +Include = /etc/pacman.d/mirrorlist.mingw +eod +sudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-packages/master/pacman-mirrors/mirrorlist.mingw +# sudo curl -OsS https://raw.githubusercontent.com/msys2/MSYS2-packages/master/pacman-mirrors/mirrorlist.msys +) + +sudo pacman-key --init +sudo pacman-key --populate msys2 +sudo pacman -Syu --noconfirm + +# Install required pacman packages. +sudo pacman -S --noconfirm \ + mingw-w64-x86_64-curl-winssl \ + mingw-w64-x86_64-giflib \ + mingw-w64-x86_64-icu \ + mingw-w64-x86_64-leptonica \ + mingw-w64-x86_64-libarchive \ + mingw-w64-x86_64-libidn2 \ + mingw-w64-x86_64-openjpeg2 \ + mingw-w64-x86_64-openssl \ + mingw-w64-x86_64-pango \ + mingw-w64-x86_64-libpng \ + mingw-w64-x86_64-libtiff \ + mingw-w64-x86_64-libwebp + +git config --global user.email "sw@weilnetz.de" +git config --global user.name "Stefan Weil" +git tag -a "v$TAG" -m "Tesseract $TAG" + +# Run autogen. +./autogen.sh + +# Build Tesseract installer. +mkdir -p "$BUILDDIR" && cd "$BUILDDIR" + +# Run configure. +PKG_CONFIG_PATH=$MINGW/lib/pkgconfig +export PKG_CONFIG_PATH +# Disable OpenMP (see https://github.com/tesseract-ocr/tesseract/issues/1662). 
+../../../configure --disable-openmp --host="$HOST" --prefix="/usr/$HOST" \ + CXX="$HOST-g++-posix" \ + CXXFLAGS="-fno-math-errno -Wall -Wextra -Wpedantic -g -O2 -isystem $MINGW/include" \ + LDFLAGS="-L$MINGW/lib" + +make all training +MINGW_INSTALL=${PWD}${MINGW} +make install-jars install training-install html prefix="$MINGW_INSTALL" INSTALL_STRIP_FLAG=-s +test -d venv || python3 -m venv venv +source venv/bin/activate +pip install pefile +mkdir -p dll +ln -sv $("$ROOTDIR/nsis/find_deps.py" "$MINGW_INSTALL"/bin/*.exe "$MINGW_INSTALL"/bin/*.dll) dll/ +ln -svf /usr/lib/gcc/x86_64-w64-mingw32/*-win32/libstdc++-6.dll dll/ +ln -svf /usr/lib/gcc/x86_64-w64-mingw32/*-win32/libgcc_s_seh-1.dll dll/ +make winsetup prefix="$MINGW_INSTALL" + +# Copy result for upload. +mkdir -p "$DISTDIR" && cp nsis/tesseract-ocr-w*-setup-*.exe "$DISTDIR" diff --git a/nsis/find_deps.py b/nsis/find_deps.py new file mode 100755 index 0000000000..0aa12a91b7 --- /dev/null +++ b/nsis/find_deps.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2024 Stefan Weil +# +# SPDX-License-Identifier: MIT +# +# Find the DLL files which are required for a given set of +# Windows executables and libraries. + +import argparse +import os +import pefile + +VERBOSE = False + +def find_dependencies(binary, search_path, analyzed_deps): + pe = pefile.PE(binary) + pe.parse_data_directories() + if VERBOSE: + print(f'{binary}:') + # print(pe.dump_info()) + + for entry in pe.DIRECTORY_ENTRY_IMPORT: + name = entry.dll.decode('utf-8') + if name in analyzed_deps: + if VERBOSE: + print(f'skip {name} (already analyzed)') + continue + analyzed_deps.add(name) + fullpath = os.path.join(search_path, name) + if not os.path.exists(fullpath): + # Not found, maybe system DLL. Skip it. 
+ if VERBOSE: + print(f'skip {name} (not found, maybe system DLL)') + continue + print(fullpath) + analyzed_deps = find_dependencies(fullpath, search_path, analyzed_deps) + + return analyzed_deps + +def main(): + """ + Command-line interface for universal dependency scanner. + """ + + parser = argparse.ArgumentParser(description='Find and copy DLL dependencies') + parser.add_argument('files', nargs='+', help='Paths to executable or library files') + parser.add_argument('--dlldir', dest='dlldir', default='/mingw64/bin/', + help='path to dll files') + + args = parser.parse_args() + + # try: + # Find dependencies + analyzed_deps = set() + for binary in args.files: + if True: + analyzed_deps = find_dependencies(binary, args.dlldir, analyzed_deps) + # except: + # print(f'error: failed to find dependencies for {binary}') + + +if __name__ == '__main__': + main() diff --git a/nsis/tesseract.nsi b/nsis/tesseract.nsi index 104abb85ed..1fcead7d3d 100644 --- a/nsis/tesseract.nsi +++ b/nsis/tesseract.nsi @@ -1,6 +1,6 @@ ; (C) Copyright 2010, Sergey Bronnikov ; (C) Copyright 2010-2012, Zdenko Podobný -; (C) Copyright 2015-2023 Stefan Weil +; (C) Copyright 2015-2024 Stefan Weil ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. @@ -29,6 +29,12 @@ Unicode true ;define CROSSBUILD ;define SHARED ;define W64 +!ifndef COMMENTS +!define COMMENTS "GitHub CI build" +!endif +!ifndef COMPANYNAME +!define COMPANYNAME "Open Source Community" +!endif !ifndef SRCDIR !define SRCDIR . 
!endif @@ -43,13 +49,11 @@ Unicode true !define PRODUCT_WEB_SITE "https://github.com/tesseract-ocr/tesseract" !endif !define GITHUB_RAW_FILE_URL \ - "http://digi.bib.uni-mannheim.de/tesseract/tessdata_fast" -# "http://digi.bib.uni-mannheim.de/tesseract/tessdata_fast" -# "https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main" + "https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main" !ifdef CROSSBUILD !addincludedir ${SRCDIR}\nsis\include -!addplugindir ${SRCDIR}\nsis\plugins +!addplugindir Plugins/x86-unicode !endif !ifdef W64 @@ -72,7 +76,7 @@ OutFile ${OUTFILE} !ifndef PREFIX !define PREFIX "../mingw64" !endif -!define TRAININGDIR "${PREFIX}/bin" +!define BINDIR "${PREFIX}/bin" # General Definitions Name "${PRODUCT_NAME}" @@ -85,8 +89,8 @@ BrandingText /TRIMCENTER "(c) 2010-2019 ${PRODUCT_NAME}" !define /date DATEVERSION "%Y%m%d%H%M%S" VIProductVersion "${VERSION}" VIAddVersionKey "ProductName" "${PRODUCT_NAME}" -VIAddVersionKey "Comments" "patched version provided by Stefan Weil" -VIAddVersionKey "CompanyName" "Universitätsbibliothek Mannheim" +VIAddVersionKey "Comments" "${COMMENTS}" +VIAddVersionKey "CompanyName" "${COMPANYNAME}" VIAddVersionKey "FileDescription" "Tesseract OCR" !define /date DATETIME "%Y-%m-%d-%H-%M-%S" VIAddVersionKey "FileVersion" "${DATETIME}" @@ -256,12 +260,12 @@ Section -Main SEC0000 SectionIn RO SetOutPath "$INSTDIR" # files included in distribution - File ${PREFIX}/bin/tesseract.exe - File ${PREFIX}/bin/libtesseract-*.dll + File ${BINDIR}/tesseract.exe + File ${BINDIR}/libtesseract-*.dll !ifdef CROSSBUILD File ../dll/*.dll !endif - File ${SRCDIR}\nsis\winpath.exe + File winpath.exe File ../doc/*.html CreateDirectory "$INSTDIR\tessdata" SetOutPath "$INSTDIR\tessdata" @@ -290,7 +294,7 @@ SectionEnd Section "Training Tools" SecTr SectionIn 1 SetOutPath "$INSTDIR" - File ${TRAININGDIR}\*.exe + File /x tesseract.exe ${BINDIR}/*.exe SectionEnd !define UNINST_EXE "$INSTDIR\tesseract-uninstall.exe" @@ -353,14 +357,12 
@@ SectionGroupEnd SectionGroup "Language data" SecGrp_LD Section "English" SecLang_eng SectionIn RO - SetOutPath "$INSTDIR\tessdata" - File ${SRCDIR}\tessdata\eng.* + !insertmacro Download_Lang_Data eng SectionEnd Section "Orientation and script detection" SecLang_osd SectionIn 1 - SetOutPath "$INSTDIR\tessdata" - File ${SRCDIR}\tessdata\osd.* + !insertmacro Download_Lang_Data osd SectionEnd SectionGroupEnd diff --git a/nsis/winpath.cpp b/nsis/winpath.cpp index 7b8f64e9ea..da8b2ce844 100644 --- a/nsis/winpath.cpp +++ b/nsis/winpath.cpp @@ -1,3 +1,20 @@ +// Copyright (C) 2024 Stefan Weil +// +// SPDX-License-Identifier: Apache-2.0 +// +// winpath - run a Windows program with extended PATH +// +// Usage: +// +// winpath [CMD [ARGUMENT ...]] +// +// Example: +// +// winpath cmd +// +// This will start a Windows command line with PATH extended by +// the location of the winpath executable. + #include // _spawnvp #include // _putenv_s #include // strcpy, strcat diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 349c69f9d7..5bf2c39ce1 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -33,7 +33,7 @@ namespace tesseract { /// Add coordinates to specified TextBlock, TextLine or String bounding box. /// Add word confidence if adding to a String bounding box. /// -static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, +static void AddBoxToAlto(const std::unique_ptr &it, PageIteratorLevel level, std::stringstream &alto_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); @@ -138,7 +138,6 @@ char *TessBaseAPI::GetAltoText(int page_number) { if (tesseract_->input_file_path_.empty()) { SetInputName(nullptr); } - std::stringstream alto_str; // Use "C" locale (needed for int values larger than 999). 
alto_str.imbue(std::locale::classic()); @@ -149,7 +148,7 @@ char *TessBaseAPI::GetAltoText(int page_number) { << " WIDTH=\"" << rect_width_ << "\"" << " HEIGHT=\"" << rect_height_ << "\">\n"; - ResultIterator *res_it = GetIterator(); + std::unique_ptr res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); @@ -167,7 +166,7 @@ char *TessBaseAPI::GetAltoText(int page_number) { // // TODO: optionally add TYPE, for example TYPE="photo". alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); continue; @@ -176,7 +175,7 @@ char *TessBaseAPI::GetAltoText(int page_number) { case PT_VERT_LINE: // Handle horizontal and vertical lines. alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); continue; @@ -189,24 +188,24 @@ char *TessBaseAPI::GetAltoText(int page_number) { if (res_it->IsAtBeginningOf(RIL_BLOCK)) { alto_str << "\t\t\t\tIsAtBeginningOf(RIL_PARA)) { alto_str << "\t\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { alto_str << "\t\t\t\t\t\tIsAtFinalElement(RIL_TEXTLINE, RIL_WORD); @@ -253,7 +252,6 @@ char *TessBaseAPI::GetAltoText(int page_number) { alto_str << "\t\t\t\n" << "\t\t\n"; - delete res_it; return copy_string(alto_str.str()); } diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 5e44b5c7c7..438a9462de 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -45,7 +45,7 @@ #include "polyblk.h" // for POLY_BLOCK #include "rect.h" // for TBOX #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST -#include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix +#include "tessdatamanager.h" // for TessdataManager #include "tesseractclass.h" // for Tesseract #include // for tprintf #include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP @@ -73,7 +73,7 @@ #include // for round, M_PI #include // for int32_t #include // for strcmp, strcpy -#include // for path +#include // for std::filesystem #include // for size_t #include // for std::cin #include // for std::locale::classic @@ -176,63 
+176,22 @@ FZ_HEAPDBG_TRACKER_SECTION_END_MARKER(_) /* Add all available languages recursively. */ -static void addAvailableLanguages(const std::string &datadir, const std::string &base, +static void addAvailableLanguages(const std::string &datadir, std::vector *langs) { - auto base2 = base; - if (!base2.empty()) { - base2 += "/"; + if (!std::filesystem::is_directory(datadir)) { + tprintError("The directory '{}' does not exist.\n", datadir); + return; } - const size_t extlen = sizeof(kTrainedDataSuffix); -#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) - const auto kTrainedDataSuffixUtf16 = winutils::Utf8ToUtf16(kTrainedDataSuffix); - - WIN32_FIND_DATAW data; - HANDLE handle = FindFirstFileW(winutils::Utf8ToUtf16((datadir + base2 + "*").c_str()).c_str(), &data); - if (handle != INVALID_HANDLE_VALUE) { - BOOL result = TRUE; - for (; result;) { - wchar_t *name = data.cFileName; - // Skip '.', '..', and hidden files - if (name[0] != '.') { - if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == FILE_ATTRIBUTE_DIRECTORY) { - addAvailableLanguages(datadir, base2 + winutils::Utf16ToUtf8(name), langs); - } else { - size_t len = wcslen(name); - if (len > extlen && name[len - extlen] == '.' && - wcscmp(&name[len - extlen + 1], kTrainedDataSuffixUtf16.c_str()) == 0) { - name[len - extlen] = '\0'; - langs->push_back(base2 + winutils::Utf16ToUtf8(name)); - } - } - } - result = FindNextFileW(handle, &data); - } - FindClose(handle); - } -#else // _WIN32 - DIR *dir = opendir((datadir + base).c_str()); - if (dir != nullptr) { - dirent *de; - while ((de = readdir(dir))) { - char *name = de->d_name; - // Skip '.', '..', and hidden files - if (name[0] != '.') { - struct stat st; - if (stat((datadir + base2 + name).c_str(), &st) == 0 && (st.st_mode & S_IFDIR) == S_IFDIR) { - addAvailableLanguages(datadir, base2 + name, langs); - } else { - size_t len = strlen(name); - if (len > extlen && name[len - extlen] == '.' 
&& - strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) { - name[len - extlen] = '\0'; - langs->push_back(base2 + name); - } - } - } + for (const auto& entry : + std::filesystem::recursive_directory_iterator(datadir, + std::filesystem::directory_options::follow_directory_symlink | + std::filesystem::directory_options::skip_permission_denied)) { + auto path = entry.path().lexically_relative(datadir).string(); + auto extPos = path.rfind(".traineddata"); + if (extPos != std::string::npos) { + langs->push_back(path.substr(0, extPos)); } - closedir(dir); } -#endif } @@ -323,7 +282,7 @@ ImageCostEstimate TessBaseAPI::EstimateImageMemoryCost(int image_width, int imag cost *= image_height; if (allowed_image_memory_capacity > 0.0) { - // any rediculous input values will be replaced by the Tesseract configuration value: + // any ridiculous input values will be replaced by the Tesseract configuration value: if (allowance > allowed_image_memory_capacity || allowance <= 0.0) allowance = allowed_image_memory_capacity; } @@ -842,7 +801,7 @@ void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector *langs) langs->clear(); ASSERT_HOST(tesseract_ != nullptr); const Tesseract &tess = tesseract(); - addAvailableLanguages(tess.datadir_, "", langs); + addAvailableLanguages(tess.datadir_, langs); std::sort(langs->begin(), langs->end()); } @@ -1064,7 +1023,7 @@ Pix *TessBaseAPI::GetThresholdedImage() { // Image p1 = pixRotate(tess.pix_binary(), 0.15, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0); // because we want to keep the public API as-is for now, instead of migrating it to using Image type directly, - // we downgrade to `PIX *` at the exit point, hence the reponsibility to CLONE is ours: + // we downgrade to `PIX *` at the exit point, hence the responsibility to CLONE is ours: return tess.pix_binary().clone2pix(); } @@ -1642,7 +1601,7 @@ const char * TessBaseAPI::GetVisibleImageFilename() { const char *TessBaseAPI::GetDatapath() { Tesseract &tess = tesseract(); - return 
tess.datadir_.c_str(); + return tess.datadir_.string().c_str(); } int TessBaseAPI::GetSourceYResolution() { diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index 76179aff1d..c8658bad45 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -134,7 +134,6 @@ char *TessBaseAPI::GetHOCRText(int page_number) { if (tesseract_->input_file_path_.empty()) { SetInputName(nullptr); } - std::stringstream hocr_str; // Use "C" locale (needed for double values x_size and x_descenders). hocr_str.imbue(std::locale::classic()); @@ -489,9 +488,9 @@ bool TessHOcrRenderer::BeginDocumentHandler() { " \n" " \n" diff --git a/src/api/jsonrenderer.cpp b/src/api/jsonrenderer.cpp new file mode 100644 index 0000000000..ba5130305c --- /dev/null +++ b/src/api/jsonrenderer.cpp @@ -0,0 +1,338 @@ +#include +#include +#include +#include +#include +#include +#include "tesseractclass.h" + +namespace tesseract { + +std::string JsonEscape(const char *text) { + std::string ret; + const char *ptr; + for (ptr = text; *ptr; ptr++) { + switch (*ptr) { + case '"': + ret += "\\\""; + break; + case '\\': + ret += "\\\\"; + break; + case '\b': + ret += "\\b"; + break; + case '\f': + ret += "\\f"; + break; + case '\n': + ret += "\\n"; + break; + case '\r': + ret += "\\r"; + break; + case '\t': + ret += "\\t"; + break; + default: + ret += *ptr; + } + } + return ret; +} + + +static void AddBoxToJson(const tesseract::ResultIterator* it, tesseract::PageIteratorLevel level, std::stringstream& json_str) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + json_str << "{ \"x0\": " << left << ", \"y0\": " << top << ", \"x1\": " << right << ", \"y1\": " << bottom << " }"; +} + +static void AddBaselineCoordsToJson(const tesseract::ResultIterator* it, tesseract::PageIteratorLevel level, std::stringstream& json_str) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + + int x1, y1, x2, y2; + if 
(!it->Baseline(level, &x1, &y1, &x2, &y2)) { + return; + } + + json_str << ",\n \"baseline\": { " << "\"x0\": " << x1 << ", \"y0\": " << y1 << ", \"x1\": " << x2 << ", \"y1\": " << y2 << " }"; +} + +/** + * Make a JSON-formatted string with JSON from the internal + * data structures. + * page_number is 0-based but will appear in the output as 1-based. + * Image name/input_file_ can be set by SetInputName before calling + * GetJSONText + * STL removed from original patch submission and refactored by rays. + * Returned string must be freed with the delete [] operator. + */ +char *TessBaseAPI::GetJSONText(int page_number) { + return GetJSONText(nullptr, page_number); +} + +/** + * Make a JSON-formatted string with JSON from the internal + * data structures. + * page_number is 0-based but will appear in the output as 1-based. + * Image name/input_file_ can be set by SetInputName before calling + * GetJSONText + * STL removed from original patch submission and refactored by rays. + * Returned string must be freed with the delete [] operator. + */ +char* TessBaseAPI::GetJSONText(ETEXT_DESC* monitor, int page_number) { + if (tesseract_ == nullptr || + (page_res_ == nullptr && Recognize(monitor) < 0)) { + return nullptr; + } + + std::stringstream json_str; + json_str.imbue(std::locale::classic()); + json_str.precision(8); + json_str << "{\n \"page_id\": " << page_number + 1 << ",\n \"blocks\": ["; + + bool first_word = true; + bool first_block = true; + + std::unique_ptr res_it(GetIterator()); + while (!res_it->Empty(tesseract::RIL_BLOCK)) { + + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + if (res_it->IsAtBeginningOf(tesseract::RIL_BLOCK)) { + + // Skip non-text blocks. + // In addition to generally not being useful to the user, + // non-text blocks can cause major performance issues + // for some images where they greatly outnumber the text blocks. 
+ if (!PTIsTextType(res_it->BlockType())) { + res_it->Next(tesseract::RIL_BLOCK); + continue; + } + + if (!first_block) json_str << ","; + first_block = false; + json_str << "\n {\n \"bbox\": "; + AddBoxToJson(res_it.get(), tesseract::RIL_BLOCK, json_str); + + if (recognition_done_) { + const std::unique_ptr grapheme( + res_it->GetUTF8Text(tesseract::RIL_BLOCK)); + json_str << ",\n \"text\": \"" << JsonEscape(grapheme.get()).c_str() << "\""; + json_str << ",\n \"confidence\": " + << static_cast(res_it->Confidence(tesseract::RIL_BLOCK)); + } else { + json_str << ",\n \"text\": null"; + json_str << ",\n \"confidence\": null"; + } + + json_str << ",\n \"blocktype\": " + << static_cast(res_it->BlockType()); + + json_str << ",\n \"paragraphs\": ["; + } + if (res_it->IsAtBeginningOf(tesseract::RIL_PARA)) { + + json_str << "\n {\n \"bbox\": "; + AddBoxToJson(res_it.get(), tesseract::RIL_PARA, json_str); + + if (recognition_done_) { + const std::unique_ptr grapheme( + res_it->GetUTF8Text(tesseract::RIL_PARA)); + json_str << ",\n \"text\": \"" << JsonEscape(grapheme.get()).c_str() << "\""; + json_str << ",\n \"confidence\": " + << static_cast(res_it->Confidence(tesseract::RIL_PARA)); + } else { + json_str << ",\n \"text\": null"; + json_str << ",\n \"confidence\": null"; + } + + json_str << ",\n \"is_ltr\": " + << static_cast(res_it->ParagraphIsLtr()); + + json_str << ",\n \"lines\": ["; + } + if (res_it->IsAtBeginningOf(tesseract::RIL_TEXTLINE)) { + + json_str << "\n {\n \"bbox\": "; + AddBoxToJson(res_it.get(), tesseract::RIL_TEXTLINE, json_str); + + if (recognition_done_) { + const std::unique_ptr grapheme( + res_it->GetUTF8Text(tesseract::RIL_TEXTLINE)); + json_str << ",\n \"text\": \"" << JsonEscape(grapheme.get()).c_str() << "\""; + json_str << ",\n \"confidence\": " + << static_cast(res_it->Confidence(tesseract::RIL_TEXTLINE)); + } else { + json_str << ",\n \"text\": null"; + json_str << ",\n \"confidence\": null"; + } + + float row_height, descenders, ascenders; + 
res_it->RowAttributes(&row_height, &descenders, &ascenders); + + json_str << ",\n \"rowAttributes\": {"; + json_str << "\n \"rowHeight\": " << row_height; + // Descenders is reported as a negative within Tesseract internally so we need to flip it. + // The positive version is intuitive, and matches what is reported in the hOCR output. + json_str << ",\n \"descenders\": " << -descenders; + json_str << ",\n \"ascenders\": " << ascenders; + json_str << "\n }"; + + AddBaselineCoordsToJson(res_it.get(), tesseract::RIL_TEXTLINE, json_str); + json_str << ",\n \"words\": ["; + first_word = true; + } + + bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); + bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); + + if (!first_word) json_str << ","; + json_str << "\n {\n \"bbox\": "; + AddBoxToJson(res_it.get(), tesseract::RIL_WORD, json_str); + + if (recognition_done_) { + const std::unique_ptr grapheme_word( + res_it->GetUTF8Text(tesseract::RIL_WORD)); + json_str << ",\n \"text\": \"" << JsonEscape(grapheme_word.get()).c_str() << "\","; + json_str << "\n \"confidence\": " << static_cast(res_it->Confidence(tesseract::RIL_WORD)); + } else { + json_str << ",\n \"text\": null,"; + json_str << "\n \"confidence\": null"; + } + + tesseract::WordChoiceIterator wc(*res_it); + int wc_cnt = 0; + json_str << ",\n \"choices\": ["; + do { + const char *choice = wc.GetUTF8Text(); + if (choice != nullptr) { + if (wc_cnt > 0) json_str << ","; + wc_cnt++; + json_str << "\n {\n"; + json_str << " \"text\": \"" << JsonEscape(choice).c_str() << "\","; + json_str << "\n \"confidence\": " << static_cast(wc.Confidence()); + json_str << "\n }"; + } + } while (recognition_done_ && wc.Next()); + if (wc_cnt > 0) { + json_str << "\n ]"; + } else { + json_str << "]"; + } + + bool bold, italic, underlined, monospace, serif, smallcaps; + int pointsize, font_id; + const char* font_name = + 
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, + &serif, &smallcaps, &pointsize, &font_id); + json_str << ",\n \"font_name\": \"" << (font_name ? font_name : "") << "\""; + + // Add symbols array + // This needs to happen last, as it will advance the iterator to the next word. + json_str << ",\n \"symbols\": ["; + + bool first_char = true; + do { + if (!first_char) json_str << ","; + json_str << "\n"; + json_str << " {\n"; + json_str << " \"bbox\": "; + AddBoxToJson(res_it.get(), tesseract::RIL_SYMBOL, json_str); + + if (recognition_done_) { + const std::unique_ptr grapheme( + res_it->GetUTF8Text(tesseract::RIL_SYMBOL)); + json_str << ",\n \"text\": \"" << JsonEscape(grapheme.get()).c_str() << "\""; + json_str << ",\n \"confidence\": " + << static_cast(res_it->Confidence(tesseract::RIL_SYMBOL)); + } else { + json_str << ",\n \"text\": null"; + json_str << ",\n \"confidence\": null"; + } + + json_str << ",\n \"is_superscript\": " + << static_cast(res_it->SymbolIsSuperscript()); + json_str << ",\n \"is_subscript\": " + << static_cast(res_it->SymbolIsSubscript()); + json_str << ",\n \"is_dropcap\": " + << static_cast(res_it->SymbolIsDropcap()); + + json_str << "\n }"; + first_char = false; + + res_it->Next(tesseract::RIL_SYMBOL); + } while (!res_it->Empty(tesseract::RIL_BLOCK) && + !res_it->IsAtBeginningOf(tesseract::RIL_WORD)); + + json_str << "\n ]"; + json_str << "\n }"; + first_word = false; + + // Close any ending block/paragraph/textline. 
+ if (last_word_in_line) { + json_str << "\n ]\n }"; + if (!last_word_in_para) { + json_str << ","; + } + } + if (last_word_in_para) { + json_str << "\n ]\n }"; + if (!last_word_in_block) { + json_str << ","; + } + } + if (last_word_in_block) { + json_str << "\n ]\n }"; + } + + } + + json_str << "\n ]\n}\n"; + + const std::string &text = json_str.str(); + char *result = new char[text.length() + 1]; + strcpy(result, text.c_str()); + return result; + +} + + +/********************************************************************** + * JSON Text Renderer interface implementation + **********************************************************************/ +TessJsonRenderer::TessJsonRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "json") { +} + +bool TessJsonRenderer::BeginDocumentHandler() { + AppendString("{\n \"version\": \"" TESSERACT_VERSION_STR "\",\n"); + AppendString(" \"pages\": [\n"); + + return true; +} + +bool TessJsonRenderer::AddImageHandler(TessBaseAPI *api) { + const std::unique_ptr json(api->GetJSONText(imagenum())); + if (json == nullptr) { + return false; + } + + AppendString(json.get()); + + return true; +} + +bool TessJsonRenderer::EndDocumentHandler() { + AppendString(" ]\n}\n"); + + return true; +} + +} // namespace tesseract diff --git a/src/api/pagerenderer.cpp b/src/api/pagerenderer.cpp index cc88530b47..aee6709642 100644 --- a/src/api/pagerenderer.cpp +++ b/src/api/pagerenderer.cpp @@ -3,7 +3,7 @@ // Description: PAGE XML rendering interface // Author: Jan Kamlah -// (C) Copyright 2021 +// (C) Copyright 2024 // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -498,7 +498,7 @@ Pta *FitBaselineIntoLinePolygon(Pta *bottom_pts, Pta *baseline_pts, tesseract::W } num_pts = ptaGetCount(bottom_pts); - // Create a interpolated polygon with stepsize 1 + // Create an interpolated polygon with stepsize 1. 
for (int index = 0; index < num_pts - 1; ++index) { ptaGetIPt(bottom_pts, index, &x0, &y0); ptaGetIPt(bottom_pts, index + 1, &x1, &y1); @@ -648,7 +648,7 @@ bool TessPAGERenderer::AddImageHandler(TessBaseAPI *api) { "pagecontent.xsd\">\n" "\t if (std::regex_search(api->GetInputName(), std::regex("^(https?|ftp|ssh):"))) { @@ -717,7 +717,6 @@ char *TessBaseAPI::GetPAGEText(int page_number) { if (tesseract_->input_file_path_.empty()) { SetInputName(nullptr); } - // Used variables std::stringstream reading_order_str; @@ -772,7 +771,7 @@ char *TessBaseAPI::GetPAGEText(int page_number) { << "\" caption=\"Regions reading order\">\n"; std::unique_ptr res_it(GetIterator()); - + float block_conf = 0; float line_conf = 0; @@ -857,7 +856,7 @@ char *TessBaseAPI::GetPAGEText(int page_number) { // for now using LinePts bool skewed_flag = (orientation_block != ORIENTATION_PAGE_UP && orientation_block != ORIENTATION_PAGE_DOWN); - + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { // writing_direction_before = writing_direction; line_conf = ((res_it->Confidence(RIL_TEXTLINE)) / 100.); diff --git a/src/arch/intsimdmatrix.h b/src/arch/intsimdmatrix.h index 06eba7572a..55507db96a 100644 --- a/src/arch/intsimdmatrix.h +++ b/src/arch/intsimdmatrix.h @@ -115,6 +115,8 @@ struct TESS_API IntSimdMatrix { static const IntSimdMatrix *intSimdMatrix; // Only available with NEON. static const IntSimdMatrix *intSimdMatrixNEON; + // Only available with RVV. + static const IntSimdMatrix intSimdMatrixRVV; // Only available with AVX2 / AVX / FMA / SSE. static const IntSimdMatrix *intSimdMatrixAVX2; static const IntSimdMatrix *intSimdMatrixAVX512VNNI; diff --git a/src/arch/intsimdmatrixrvv.cpp b/src/arch/intsimdmatrixrvv.cpp new file mode 100644 index 0000000000..cd0ee68098 --- /dev/null +++ b/src/arch/intsimdmatrixrvv.cpp @@ -0,0 +1,88 @@ +/////////////////////////////////////////////////////////////////////// +// File: intsimdmatrixrvv.cpp +// Description: matrix-vector product for 8-bit data on rvv. 
+// Author: sunyuechi +// +// Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS). +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" // for HAVE_RVV, ... +#endif + +#if HAVE_RVV +# include "intsimdmatrix.h" +# include "tesstypes.h" + +namespace tesseract { + +static int DotProduct(const int8_t *u, const int8_t *v, int num) { + int total = 0; + + asm __volatile__ ( + " .option arch, +v \n\t" + " vsetvli t0,zero,e32,m8,ta,ma \n\t" + " vmv.v.i v0,0 \n\t" + "1: \n\t" + " vsetvli t0,%[num],e8,m2,ta,ma \n\t" + " vle8.v v16,0(%[u]) \n\t" + " vle8.v v24,0(%[v]) \n\t" + " sub %[num],%[num],t0 \n\t" + " vwmul.vv v8,v24,v16 \n\t" + " add %[u],%[u],t0 \n\t" + " add %[v],%[v],t0 \n\t" + " vsetvli zero,zero,e16,m4,tu,ma \n\t" + " vwadd.wv v0,v0,v8 \n\t" + " bnez %[num],1b \n\t" + " vsetvli t0,zero,e32,m8,ta,ma \n\t" + " vmv.s.x v8,zero \n\t" + " vredsum.vs v0,v0,v8 \n\t" + " vmv.x.s %[total],v0 \n\t" + : [u] "+r" (u), + [v] "+r" (v), + [num] "+r" (num), + [total] "+r" (total) + : + : "cc", "memory" + ); + + return total; +} + +static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales, + const int8_t *u, TFloat *v) { + int num_out = dim1; + int num_in = dim2 - 1; + for (int i = 0; i < num_out; ++i) { + const int8_t *wi_start = wi + i * dim2; + int total = DotProduct(wi_start, u, num_in); + // Add in the bias 
and apply scaling. + v[i] = (total + wi_start[num_in] * INT8_MAX) * scales[i]; + } +} + +const IntSimdMatrix IntSimdMatrix::intSimdMatrixRVV = { + // Function. + matrixDotVector, + // Number of 32 bit outputs held in each register. + 1, + // Maximum number of registers that we will use to hold outputs. + 1, + // Number of 8 bit inputs in the inputs register. + 1, + // Number of inputs in each weight group. + 1 +}; + +} // namespace tesseract. + +#endif /* HAVE_RVV */ diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp index e783e4d859..c7827462ee 100644 --- a/src/arch/simddetect.cpp +++ b/src/arch/simddetect.cpp @@ -80,6 +80,12 @@ # endif #endif +#if defined(HAVE_RVV) +# if defined(HAVE_GETAUXVAL) +# include +# define HWCAP_RV(letter) (1ul << ((letter) - 'A')) +# endif +#endif namespace tesseract { @@ -107,6 +113,8 @@ bool SIMDDetect::neon_available_ = true; #elif defined(HAVE_NEON) // If true, then Neon has been detected. bool SIMDDetect::neon_available_ = true; +#elif defined(HAVE_RVV) +bool SIMDDetect::rvv_available_ = false; #else // If true, then Neon has been detected. bool SIMDDetect::neon_available_ = false; @@ -262,6 +270,13 @@ SIMDDetect::SIMDDetect() { elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); neon_available_ = hwcap & HWCAP_NEON; # endif +#endif + +#if defined(HAVE_RVV) +# if defined(HAVE_GETAUXVAL) + const unsigned long hwcap = getauxval(AT_HWCAP); + rvv_available_ = !!(hwcap & HWCAP_RV('V')); +# endif #endif // Select code for calculation of dot product based on autodetection. @@ -294,6 +309,10 @@ SIMDDetect::SIMDDetect() { // NEON detected. 
SetDotProduct(DotProductNEON, IntSimdMatrix::intSimdMatrixNEON); dotproduct_method = "neon"; +#if defined(HAVE_RVV) + } else if (rvv_available_) { + SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV); +#endif #if defined(HAVE_FRAMEWORK_ACCELERATE) } else { SetDotProduct(DotProductAccelerate); @@ -376,7 +395,8 @@ void SIMDDetect::Update() { (avx_available_ && IntSimdMatrix::intSimdMatrixSSE != nullptr) ? " avx" : "", (fma_available_ && IntSimdMatrix::intSimdMatrixSSE != nullptr) ? " fma" : "", (sse_available_ && IntSimdMatrix::intSimdMatrixSSE != nullptr) ? " sse" : "", - (neon_available_ && IntSimdMatrix::intSimdMatrixNEON != nullptr) ? " neon" : ""); + (neon_available_ && IntSimdMatrix::intSimdMatrixNEON != nullptr) ? " neon" : "" + ); } dotproduct.set_value(dotproduct_method); diff --git a/src/arch/simddetect.h b/src/arch/simddetect.h index fcb0f53eca..5d4eb33880 100644 --- a/src/arch/simddetect.h +++ b/src/arch/simddetect.h @@ -63,6 +63,10 @@ class SIMDDetect { static inline bool IsNEONAvailable() { return detector.neon_available_; } + // Returns true if RVV is available on this system. + static inline bool IsRVVAvailable() { + return detector.rvv_available_; + } // Update settings after config variable was set. static TESS_API void Update(); @@ -86,6 +90,8 @@ class SIMDDetect { static TESS_API bool sse_available_; // If true, then NEON has been detected. static TESS_API bool neon_available_; + // If true, then RVV has been detected. 
+ static TESS_API bool rvv_available_; }; } // namespace tesseract diff --git a/src/ccmain/applybox.cpp b/src/ccmain/applybox.cpp index 37e6f3d7a9..a97fac3bfa 100644 --- a/src/ccmain/applybox.cpp +++ b/src/ccmain/applybox.cpp @@ -28,6 +28,7 @@ #include #include "pageres.h" #include "tesseractclass.h" +#include "tesserrstream.h" // for tesserr #include "unicharset.h" #if !DISABLED_LEGACY_ENGINE diff --git a/src/ccmain/control.cpp b/src/ccmain/control.cpp index bb1871d34e..892304c6ed 100644 --- a/src/ccmain/control.cpp +++ b/src/ccmain/control.cpp @@ -45,25 +45,28 @@ #endif #include "sorthelper.h" #include "tesseractclass.h" +#include "tesserrstream.h" // for tesserr #include "tessvars.h" #include "werdit.h" #include "global_params.h" #include "pixProcessing.h" const char *const kBackUpConfigFile = "tempconfigdata.config"; + #if !DISABLED_LEGACY_ENGINE // Min believable x-height for any text when refitting as a fraction of // original x-height const double kMinRefitXHeightFraction = 0.5; #endif // !DISABLED_LEGACY_ENGINE +namespace tesseract { + /** * Make a word from the selected blobs and run Tess on them. * * @param page_res recognise blobs * @param selection_box within this box */ -namespace tesseract { void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) { PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box); @@ -1602,7 +1605,10 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordD // Points to the best result. May be word or in lang_words. 
const WERD_RES *word = word_data->word; plf::nanotimer clock; - clock.start(); + const bool timing_debug = tessedit_timing_debug; + if (timing_debug) { + clock.start(); + } const bool debug = (classify_debug_level > 0 || multilang_debug_level > 0); if (debug) { TBOX bbox = word->word->bounding_box(); @@ -1653,11 +1659,12 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordD } else { tprintWarn("no best words!!\n"); } - if (tessedit_timing_debug) { - tprintDebug("classify_word_and_language -> word best choice: {} (bbox: {}, OCR took {} sec)\n", + if (timing_debug) { + auto total_time = clock.get_elapsed_ms(); + tprintDebug("classify_word_and_language -> word best choice: {} (bbox: {}, OCR took {} ms)\n", mdqstr(word_data->word->best_choice->unichar_string()), word_data->word->word->bounding_box().print_to_str(), - clock.get_elapsed_sec()); + total_time); } } diff --git a/src/ccmain/fixspace.cpp b/src/ccmain/fixspace.cpp index b90963b597..56786ee77d 100644 --- a/src/ccmain/fixspace.cpp +++ b/src/ccmain/fixspace.cpp @@ -57,12 +57,9 @@ class ROW; **********************************************************************/ static int c_blob_comparator( // sort blobs - const void *blob1p, // ptr to ptr to blob1 - const void *blob2p // ptr to ptr to blob2 + const C_BLOB *blob1, + const C_BLOB *blob2 ) { - const C_BLOB *blob1 = *reinterpret_cast(blob1p); - const C_BLOB *blob2 = *reinterpret_cast(blob2p); - return blob1->bounding_box().left() - blob2->bounding_box().left(); } diff --git a/src/ccmain/ltrresultiterator.cpp b/src/ccmain/ltrresultiterator.cpp index 7c4eb3a6f3..30424fec1c 100644 --- a/src/ccmain/ltrresultiterator.cpp +++ b/src/ccmain/ltrresultiterator.cpp @@ -101,13 +101,11 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const { float mean_certainty = 0.0f; int certainty_count = 0; PAGE_RES_IT res_it(*it_); - WERD_CHOICE *best_choice = res_it.word()->best_choice; - ASSERT_HOST(best_choice != nullptr); + WERD_CHOICE 
*best_choice; switch (level) { case RIL_BLOCK: do { best_choice = res_it.word()->best_choice; - ASSERT_HOST(best_choice != nullptr); mean_certainty += best_choice->certainty(); ++certainty_count; res_it.forward(); @@ -116,7 +114,6 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const { case RIL_PARA: do { best_choice = res_it.word()->best_choice; - ASSERT_HOST(best_choice != nullptr); mean_certainty += best_choice->certainty(); ++certainty_count; res_it.forward(); @@ -126,19 +123,24 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const { case RIL_TEXTLINE: do { best_choice = res_it.word()->best_choice; - ASSERT_HOST(best_choice != nullptr); mean_certainty += best_choice->certainty(); ++certainty_count; res_it.forward(); } while (res_it.row() == res_it.prev_row()); break; case RIL_WORD: - mean_certainty += best_choice->certainty(); - ++certainty_count; + best_choice = res_it.word()->best_choice; + mean_certainty = best_choice->certainty(); + certainty_count = 1; break; case RIL_SYMBOL: - mean_certainty += best_choice->certainty(blob_index_); - ++certainty_count; + best_choice = res_it.word()->best_choice; + mean_certainty = best_choice->certainty(blob_index_); + certainty_count = 1; + break; + default: + ASSERT_HOST_MSG(false, "Should never get here."); + break; } if (certainty_count > 0) { mean_certainty /= certainty_count; @@ -322,7 +324,6 @@ char *LTRResultIterator::WordNormedUTF8Text() const { std::string ocr_text; WERD_CHOICE *best_choice = it_->word()->best_choice; const UNICHARSET *unicharset = it_->word()->uch_set; - ASSERT_HOST(best_choice != nullptr); for (unsigned i = 0; i < best_choice->length(); ++i) { ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i)); } diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp index 4ef42494d5..d7de770c14 100644 --- a/src/ccmain/pagesegmain.cpp +++ b/src/ccmain/pagesegmain.cpp @@ -104,8 +104,10 @@ int Tesseract::SegmentPage(const char *input_file, 
BLOCK_LIST *blocks, Tesseract std::string name; if (input_file != nullptr && input_file[0] != '\0') { name = input_file; - std::size_t lastdot = name.find_last_of("."); - name = name.substr(0, lastdot); + auto lastdot = name.find_last_of('.'); + if (lastdot != std::string::npos) { + name.resize(lastdot); + } } if (!PSM_COL_FIND_ENABLED(pageseg_mode) && !name.empty()) { read_unlv_file(name, width, height, blocks); diff --git a/src/ccmain/paragraphs.cpp b/src/ccmain/paragraphs.cpp index fbba78aee9..8ea118c256 100644 --- a/src/ccmain/paragraphs.cpp +++ b/src/ccmain/paragraphs.cpp @@ -33,6 +33,7 @@ #include "ratngs.h" // for WERD_CHOICE #include "rect.h" // for TBOX #include "statistc.h" // for STATS +#include "tesserrstream.h" // for tesserr #include // for tprintf #include "unicharset.h" // for UNICHARSET #include "werd.h" // for WERD, W_REP_CHAR diff --git a/src/ccmain/paramsd.cpp b/src/ccmain/paramsd.cpp index 7259cb3be7..2c67a3803b 100644 --- a/src/ccmain/paramsd.cpp +++ b/src/ccmain/paramsd.cpp @@ -123,9 +123,7 @@ static void GetPrefixes(const char *s, std::string &level_one, std::string &leve } // Compare two VC objects by their name. -int ParamContent::Compare(const void *v1, const void *v2) { - const ParamContent *one = *static_cast(v1); - const ParamContent *two = *static_cast(v2); +int ParamContent::Compare(const ParamContent *one, const ParamContent *two) { return strcmp(one->GetName(), two->GetName()); } diff --git a/src/ccmain/paramsd.h b/src/ccmain/paramsd.h index 62bbe862cc..10e5b6fc08 100644 --- a/src/ccmain/paramsd.h +++ b/src/ccmain/paramsd.h @@ -37,10 +37,10 @@ class Tesseract; // comparison or getting its value. It is used in the context of the // ParamsEditor as a bridge from the internal tesseract parameters to the // ones displayed by the ScrollView server. -class ParamContent : public ELIST_LINK { +class ParamContent : public ELIST::LINK { public: // Compare two VC objects by their name. 
- static int Compare(const void *v1, const void *v2); + static int Compare(const ParamContent *v1, const ParamContent *v2); // Gets a VC object identified by its ID. static ParamContent *GetParamContentById(int id); diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp index a174461b57..ebccc50032 100644 --- a/src/ccmain/tessedit.cpp +++ b/src/ccmain/tessedit.cpp @@ -28,6 +28,7 @@ #include #include "stopper.h" #include "tesseractclass.h" +#include "tesserrstream.h" // for tesserr #include "tessvars.h" #include #if !DISABLED_LEGACY_ENGINE @@ -48,38 +49,28 @@ void Tesseract::read_config_file(const char *filename) { return; } - std::string path = datadir_; - path += "configs/"; - path += filename; - tprintDebug("Read Config: test if '{}' is a readable file: ", path); - FILE *fp; - if ((fp = fopen(path.c_str(), "rb")) != nullptr) { - fclose(fp); - } else { - path = datadir_; - path += "tessconfigs/"; - path += filename; - tprintDebug("NO.\n" - "Read Config: test if '{}' is a readable file: ", path); - if ((fp = fopen(path.c_str(), "rb")) != nullptr) { - fclose(fp); - } else { - path = filename; - tprintDebug("NO.\n" - "Read Config: test if '{}' is a readable file: ", path); - if ((fp = fopen(path.c_str(), "rb")) != nullptr) { - fclose(fp); - } - else { - tprintDebug("NO.\n"); + // Construct potential config file paths + std::vector config_paths = { + datadir / "configs" / filename, + datadir / "tessconfigs" / filename, + std::filesystem::path(filename)}; + + // Use the first existing file or fallback to the last (filename) + auto config_file = std::find_if(config_paths.begin(), config_paths.end(), + [](const std::filesystem::path &path) { + std::error_code ec; + tprintDebug("Read Config: test if '{}' is a readable file: ", path); + auto rv = std::filesystem::exists(path, ec); + tprintDebug("{}.\n", rv ? 
"YES" : "NO"); + return rv; + }); + if (config_file == config_paths.end()) { tprintError("Config file '{}' cannot be opened / does not exist anywhere we looked.\n", filename); return; } - } - } - tprintDebug("YES\n"); + const std::filesystem::path &selected_path = *config_file; - ParamUtils::ReadParamsFile(path, this->params_collective(), nullptr, PARAM_VALUE_IS_SET_BY_CONFIGFILE); + ParamUtils::ReadParamsFile(selected_path.string().c_str(), this->params_collective(), nullptr, PARAM_VALUE_IS_SET_BY_CONFIGFILE); } bool Tesseract::InitParameters(const std::vector &vars_vec, @@ -136,12 +127,11 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, TessdataManager *mgr) { // Set the language data path prefix lang_ = !language.empty() ? language : "eng"; - language_data_path_prefix_ = datadir_; - language_data_path_prefix_ += lang_; - language_data_path_prefix_ += "."; + //std::filesystem::path + language_data_path_prefix_ = datadir_ / (lang + "."); // Initialize TessdataManager. 
- std::string tessdata_path = language_data_path_prefix_ + kTrainedDataSuffix; + std::filesystem::path tessdata_path = language_data_path_prefix_ + kTrainedDataSuffix; if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) { tprintError("Error opening data file {}\n", tessdata_path); tprintInfo( @@ -208,7 +198,7 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { #endif // DISABLED_LEGACY_ENGINE if (mgr->IsComponentAvailable(TESSDATA_LSTM)) { - lstm_recognizer_ = new LSTMRecognizer(this); + lstm_recognizer_ = new LSTMRecognizer(*this); ResyncVariablesInternally(); // lstm_recognizer_->SetDataPathPrefix(language_data_path_prefix); diff --git a/src/ccmain/thresholder.cpp b/src/ccmain/thresholder.cpp index 144a2f0074..6c6f21b13c 100644 --- a/src/ccmain/thresholder.cpp +++ b/src/ccmain/thresholder.cpp @@ -162,7 +162,6 @@ void ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *heigh // immediately after, but may not go away until after the Thresholder has // finished with it. void ImageThresholder::SetImage(const Image &pix, int exif, const float angle, bool upscale) { -#if 01 // Note that pix.clone() does not actually clone the data, // it simply makes a new pointer to the existing data. // Therefore, there should not be any performance penalty @@ -170,14 +169,15 @@ void ImageThresholder::SetImage(const Image &pix, int exif, const float angle, b // Rotate if specified by exif orientation value. 
Image src, temp1, temp2, temp3; + src = pix; if (exif == 3 || exif == 4) { - temp1 = pixRotateOrth(const_cast(pix.ptr()), 2); + temp1 = pixRotateOrth(src, 2); } else if (exif == 5 || exif == 6) { - temp1 = pixRotateOrth(const_cast(pix.ptr()), 1); + temp1 = pixRotateOrth(src, 1); } else if (exif == 7 || exif == 8) { - temp1 = pixRotateOrth(const_cast(pix.ptr()), 3); + temp1 = pixRotateOrth(src, 3); } else { - temp1 = pix; + temp1 = src; } // Mirror if specified by exif orientation value @@ -200,9 +200,7 @@ void ImageThresholder::SetImage(const Image &pix, int exif, const float angle, b // // clones or creates a freshly rotated copy. Image src = pixRotate(temp3, angle, L_ROTATE_AREA_MAP, L_BRING_IN_WHITE, 0, 0); -#else - Image src = pix; // clones -#endif + int depth; pixGetDimensions(src, &image_width_, &image_height_, &depth); // Convert the image as necessary so it is one of binary, plain RGB, or @@ -283,8 +281,8 @@ std::tuple ImageThresholder::Threshold(ThresholdMetho pix_grey = GetPixRectGrey(); r = pixSauvolaBinarizeTiled(pix_grey, half_window_size, kfactor, nx, ny, - (PIX**)pix_thresholds, - (PIX**)pix_binary); + pix_thresholds, + pix_binary); } break; case ThresholdMethod::OtsuOnNormalizedBackground: { @@ -326,8 +324,8 @@ std::tuple ImageThresholder::Threshold(ThresholdMetho r = pixOtsuAdaptiveThreshold(pix_grey, tile_size, tile_size, half_smooth_size, half_smooth_size, score_fraction, - (PIX **)pix_thresholds, - (PIX **)pix_binary); + pix_thresholds, + pix_binary); } break; case ThresholdMethod::Nlbin: { @@ -383,25 +381,28 @@ bool ImageThresholder::ThresholdToPix(Image *pix) { Image original = GetPixRect(); + // Handle binary image if (pix_channels_ == 0) { // We have a binary image, but it still has to be copied, as this API // allows the caller to modify the output. 
*pix = original.copy(); + original.destroy(); + return true; + } + + // Handle colormaps + Image src; + if (pixGetColormap(original)) { + src = pixRemoveColormap(original, REMOVE_CMAP_BASED_ON_SRC); + int depth = pixGetDepth(src); + if (depth > 1 && depth < 8) { + src = pixConvertTo8(src, false); + } } else { - if (pixGetColormap(original)) { - Image tmp; - Image without_cmap = pixRemoveColormap(original, REMOVE_CMAP_BASED_ON_SRC); - int depth = pixGetDepth(without_cmap); - if (depth > 1 && depth < 8) { - tmp = pixConvertTo8(without_cmap, false); - } else { - tmp = without_cmap.copy(); - } - OtsuThresholdRectToPix(tmp, pix); - } else { - OtsuThresholdRectToPix(pix_, pix); - } + src = original; } + OtsuThresholdRectToPix(src, pix); + return true; } @@ -475,7 +476,7 @@ Image ImageThresholder::GetPixRectGrey() { } // Otsu thresholds the rectangle, taking the rectangle from *this. -void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const { +void ImageThresholder::OtsuThresholdRectToPix(const Image &src_pix, Image *out_pix) const { std::vector thresholds; std::vector hi_values; @@ -489,13 +490,13 @@ void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) con /// /// NOTE that num_channels is the size of the thresholds and hi_values /// arrays and also the bytes per pixel in src_pix. 
-void ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels, const std::vector &thresholds, +void ImageThresholder::ThresholdRectToPix(const Image &src_pix, int num_channels, const std::vector &thresholds, const std::vector &hi_values, Image *pix) const { *pix = pixCreate(rect_width_, rect_height_, 1); uint32_t *pixdata = pixGetData(*pix); int wpl = pixGetWpl(*pix); int src_wpl = pixGetWpl(src_pix); - uint32_t *srcdata = pixGetData(src_pix); + const uint32_t *srcdata = pixGetData(src_pix); pixSetXRes(*pix, pixGetXRes(src_pix)); pixSetYRes(*pix, pixGetYRes(src_pix)); for (int y = 0; y < rect_height_; ++y) { diff --git a/src/ccmain/thresholder.h b/src/ccmain/thresholder.h index 8e463aab91..981c9f6aee 100644 --- a/src/ccmain/thresholder.h +++ b/src/ccmain/thresholder.h @@ -208,13 +208,13 @@ class TESS_API ImageThresholder { } // Otsu thresholds the rectangle, taking the rectangle from *this. - void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const; + void OtsuThresholdRectToPix(const Image &src_pix, Image *out_pix) const; /// Threshold the rectangle, taking everything except the src_pix /// from the class, using thresholds/hi_values to the output pix. /// NOTE that num_channels is the size of the thresholds and hi_values // arrays and also the bytes per pixel in src_pix. 
- void ThresholdRectToPix(Image src_pix, int num_channels, const std::vector &thresholds, + void ThresholdRectToPix(const Image &src_pix, int num_channels, const std::vector &thresholds, const std::vector &hi_values, Image *pix) const; private: diff --git a/src/ccstruct/blamer.cpp b/src/ccstruct/blamer.cpp index 89dc9550c3..baac81f712 100644 --- a/src/ccstruct/blamer.cpp +++ b/src/ccstruct/blamer.cpp @@ -94,7 +94,7 @@ void BlamerBundle::SetSymbolTruth(const UNICHARSET &unicharset, const char *char if (id != INVALID_UNICHAR_ID) { std::string normed_uch(unicharset.get_normed_unichar(id)); if (normed_uch.length() > 0) { - symbol_str = normed_uch; + symbol_str = std::move(normed_uch); } } int length = truth_word_.length(); diff --git a/src/ccstruct/blobbox.cpp b/src/ccstruct/blobbox.cpp index 7cd67a6f15..6043f16607 100644 --- a/src/ccstruct/blobbox.cpp +++ b/src/ccstruct/blobbox.cpp @@ -902,7 +902,7 @@ void vertical_cblob_projection( // project outlines /********************************************************************** * vertical_coutline_projection * - * Compute the vertical projection of a outline from its outlines + * Compute the vertical projection of an outline from its outlines * and add to the given STATS. 
**********************************************************************/ diff --git a/src/ccstruct/blobbox.h b/src/ccstruct/blobbox.h index 9fe3d7cbf0..6c4a155a38 100644 --- a/src/ccstruct/blobbox.h +++ b/src/ccstruct/blobbox.h @@ -156,7 +156,7 @@ class BLOBNBOX; ELISTIZEH(BLOBNBOX); -class BLOBNBOX : public ELIST_LINK { +class BLOBNBOX : public ELIST::LINK { public: BLOBNBOX() { ReInit(); @@ -586,7 +586,7 @@ class BLOBNBOX : public ELIST_LINK { bool owns_cblob_ = false; }; -class TO_ROW : public ELIST2_LINK { +class TO_ROW : public ELIST2::LINK { public: static const int kErrorWeight = 3; @@ -730,7 +730,7 @@ class TO_ROW : public ELIST2_LINK { // warning C4946: reinterpret_cast used between related classes: 'tesseract::ELIST2_LINK' and 'tesseract::TO_ROW' ELIST2IZEH(TO_ROW); -class TESS_API TO_BLOCK : public ELIST_LINK { +class TESS_API TO_BLOCK : public ELIST::LINK { public: TO_BLOCK() : pitch_decision(PITCH_DUNNO) { clear(); diff --git a/src/ccstruct/coutln.h b/src/ccstruct/coutln.h index 22034bb3d7..150caafea3 100644 --- a/src/ccstruct/coutln.h +++ b/src/ccstruct/coutln.h @@ -75,7 +75,7 @@ class C_OUTLINE; // forward declaration ELISTIZEH(C_OUTLINE); -class C_OUTLINE : public ELIST_LINK { +class C_OUTLINE : public ELIST::LINK { public: C_OUTLINE() { stepcount = 0; diff --git a/src/ccstruct/detlinefit.cpp b/src/ccstruct/detlinefit.cpp index 632cf71d4c..6acc574580 100644 --- a/src/ccstruct/detlinefit.cpp +++ b/src/ccstruct/detlinefit.cpp @@ -19,9 +19,10 @@ #include // compiler config, etc. 
#include "detlinefit.h" -#include "helpers.h" // for IntCastRounded +#include "helpers.h" // for IntCastRounded #include "statistc.h" #include "baselinedetect.h" +#include "tesserrstream.h" // for tesserr #include #include diff --git a/src/ccstruct/imagedata.cpp b/src/ccstruct/imagedata.cpp index 4fffdafe95..37411df987 100644 --- a/src/ccstruct/imagedata.cpp +++ b/src/ccstruct/imagedata.cpp @@ -25,6 +25,7 @@ #include "rect.h" // for TBOX #include "scrollview.h" // for ScrollView, Diagnostics::CYAN, Diagnostics::NONE #include // for tprintf +#include "tesserrstream.h" // for tesserr #include "helpers.h" // for IntCastRounded, TRand, ClipToRange, Modulo #include "serialis.h" // for TFile @@ -524,7 +525,8 @@ void DocumentData::Shuffle() { TRand random; // Different documents get shuffled differently, but the same for the same // name. - random.set_seed(document_name_); + std::hash hasher; + random.set_seed(static_cast(hasher(document_name_))); int num_pages = pages_.size(); // Execute one random swap for each page in the document. for (int i = 0; i < num_pages; ++i) { @@ -546,10 +548,11 @@ bool DocumentData::ReCachePages() { delete page; } pages_.clear(); -#if !defined(TESSERACT_IMAGEDATA_AS_PIX) +//#if !defined(TESSERACT_IMAGEDATA_AS_PIX) + auto name_size = document_name_.size(); if (document_name_.ends_with(".png")) { // PNG image given instead of LSTMF file. 
- std::string gt_name = document_name_.substr(0, document_name_.length() - 3) + "gt.txt"; + std::string gt_name{document_name_.substr(0, name_size - 3) + "gt.txt"}; std::ifstream t(gt_name); std::string line; std::getline(t, line); @@ -562,18 +565,23 @@ bool DocumentData::ReCachePages() { pages_offset_ %= loaded_pages; set_total_pages(loaded_pages); set_memory_used(memory_used() + image_data->MemoryUsed()); -#if 01 - tprintDebug("Loaded {}/{} lines ({}-{}) of document {}\n", pages_.size(), - loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(), - document_name_); -#endif + tprintDebug("Loaded {}/{} lines ({}-{}) of document {}\n", pages_.size(), + loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(), + document_name_.c_str()); return !pages_.empty(); } -#endif +//#endif TFile fp; - if (!fp.Open(document_name_.c_str(), reader_) || - !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) { - tprintError("Deserialize header failed: {}\n", document_name_); + if (!fp.Open(document_name_.c_str(), reader_)) { + tprintError("Deserialize failed: cannot open file: {}\n", document_name_); + return false; + } + if (!fp.DeSerializeSize(&loaded_pages)) { + tprintError("Deserialize header size failed for file: {}\n", document_name_); + return false; + } + if (loaded_pages <= 0) { + tprintError("Deserialize header produced faulty page count {} for file: {}\n", loaded_pages, document_name_); return false; } pages_offset_ %= loaded_pages; @@ -608,7 +616,7 @@ bool DocumentData::ReCachePages() { } } if (page < loaded_pages) { - tprintError("Deserialize failed: {} read {}/{} lines\n", document_name_, + tprintError("Deserialize failed for file: {}; read {}/{} lines\n", document_name_, page, loaded_pages); for (auto page : pages_) { delete page; diff --git a/src/ccstruct/ocrblock.cpp b/src/ccstruct/ocrblock.cpp index a7bbc0959c..974ea3aaca 100644 --- a/src/ccstruct/ocrblock.cpp +++ b/src/ccstruct/ocrblock.cpp @@ -70,9 +70,9 @@ BLOCK::BLOCK(const char *name, ///< 
filename * Sort Comparator: Return <0 if row1 top < row2 top */ -static int decreasing_top_order(const void *row1, const void *row2) { - return (*reinterpret_cast(row2))->bounding_box().top() - - (*reinterpret_cast(row1))->bounding_box().top(); +static int decreasing_top_order(const ROW *row1, const ROW *row2) { + return row2->bounding_box().top() - + row1->bounding_box().top(); } /** @@ -225,7 +225,7 @@ BLOCK &BLOCK::operator=( // assignment const BLOCK &source // from this ) { if (this != &source) { - this->ELIST_LINK::operator=(source); + this->ELIST::LINK::operator=(source); pdblk = source.pdblk; proportional = source.proportional; kerning = source.kerning; diff --git a/src/ccstruct/ocrblock.h b/src/ccstruct/ocrblock.h index 343584b9e4..a13a98367b 100644 --- a/src/ccstruct/ocrblock.h +++ b/src/ccstruct/ocrblock.h @@ -30,7 +30,7 @@ class BLOCK; // forward decl ELISTIZEH(BLOCK); -class TESS_API BLOCK : public ELIST_LINK +class TESS_API BLOCK : public ELIST::LINK // page block { friend class BLOCK_RECT_IT; // block iterator diff --git a/src/ccstruct/ocrpara.h b/src/ccstruct/ocrpara.h index df34e49305..81ded2e261 100644 --- a/src/ccstruct/ocrpara.h +++ b/src/ccstruct/ocrpara.h @@ -27,7 +27,7 @@ namespace tesseract { class ParagraphModel; -struct PARA : public ELIST_LINK { +struct PARA : public ELIST::LINK { public: PARA() : model(nullptr) diff --git a/src/ccstruct/ocrrow.cpp b/src/ccstruct/ocrrow.cpp index 1187048a56..b4d1070cc2 100644 --- a/src/ccstruct/ocrrow.cpp +++ b/src/ccstruct/ocrrow.cpp @@ -222,7 +222,7 @@ void ROW::plot( // draw it **********************************************************************/ ROW &ROW::operator=(const ROW &source) { - this->ELIST_LINK::operator=(source); + this->ELIST::LINK::operator=(source); kerning = source.kerning; spacing = source.spacing; xheight = source.xheight; diff --git a/src/ccstruct/ocrrow.h b/src/ccstruct/ocrrow.h index 3228e7e1af..f73e4a9104 100644 --- a/src/ccstruct/ocrrow.h +++ b/src/ccstruct/ocrrow.h @@ -36,7 
+36,7 @@ class TO_ROW; struct PARA; -class ROW : public ELIST_LINK { +class ROW : public ELIST::LINK { friend void tweak_row_baseline(ROW *, double, double); public: diff --git a/src/ccstruct/otsuthr.cpp b/src/ccstruct/otsuthr.cpp index 2bd0fde0d9..415dbd498a 100644 --- a/src/ccstruct/otsuthr.cpp +++ b/src/ccstruct/otsuthr.cpp @@ -34,7 +34,7 @@ namespace tesseract { // that there is no apparent foreground. At least one hi_value will not be -1. // The return value is the number of channels in the input image, being // the size of the output thresholds and hi_values arrays. -int OtsuThreshold(Image src_pix, int left, int top, int width, int height, std::vector &thresholds, +int OtsuThreshold(const Image &src_pix, int left, int top, int width, int height, std::vector &thresholds, std::vector &hi_values) { int num_channels = pixGetDepth(src_pix) / 8; // Of all channels with no good hi_value, keep the best so we can always @@ -92,7 +92,7 @@ int OtsuThreshold(Image src_pix, int left, int top, int width, int height, std:: // single channel. Each channel is always one byte per pixel. // Histogram is always a kHistogramSize(256) element array to count // occurrences of each pixel value. -void HistogramRect(Image src_pix, int channel, int left, int top, int width, int height, +void HistogramRect(const Image &src_pix, int channel, int left, int top, int width, int height, int *histogram) { int num_channels = pixGetDepth(src_pix) / 8; channel = ClipToRange(channel, 0, num_channels - 1); diff --git a/src/ccstruct/otsuthr.h b/src/ccstruct/otsuthr.h index 20484863c9..09fbb52c77 100644 --- a/src/ccstruct/otsuthr.h +++ b/src/ccstruct/otsuthr.h @@ -37,7 +37,7 @@ const int kHistogramSize = 256; // The size of a histogram of pixel values. // that there is no apparent foreground. At least one hi_value will not be -1. // The return value is the number of channels in the input image, being // the size of the output thresholds and hi_values arrays. 
-int OtsuThreshold(Image src_pix, int left, int top, int width, int height, +int OtsuThreshold(const Image &src_pix, int left, int top, int width, int height, std::vector &thresholds, std::vector &hi_values); @@ -45,7 +45,7 @@ int OtsuThreshold(Image src_pix, int left, int top, int width, int height, // single channel. Each channel is always one byte per pixel. // Histogram is always a kHistogramSize(256) element array to count // occurrences of each pixel value. -void HistogramRect(Image src_pix, int channel, int left, int top, int width, int height, +void HistogramRect(const Image &src_pix, int channel, int left, int top, int width, int height, int *histogram); // Computes the Otsu threshold(s) for the given histogram. diff --git a/src/ccstruct/pageres.cpp b/src/ccstruct/pageres.cpp index 72de153b78..6996bca6d8 100644 --- a/src/ccstruct/pageres.cpp +++ b/src/ccstruct/pageres.cpp @@ -193,7 +193,7 @@ ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) { WERD_RES &WERD_RES::operator=(const WERD_RES &source) { if (this != &source) { - this->ELIST_LINK::operator=(source); + this->ELIST::LINK::operator=(source); Clear(); if (source.combination) { word = new WERD; diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h index e7e3c2617a..41fa715810 100644 --- a/src/ccstruct/pageres.h +++ b/src/ccstruct/pageres.h @@ -117,7 +117,7 @@ class PAGE_RES { // page result * BLOCK_RES - Block results *************************************************************************/ -class BLOCK_RES : public ELIST_LINK { +class BLOCK_RES : public ELIST::LINK { public: BLOCK *block = nullptr; // real block int32_t char_count = 0; // chars in block @@ -141,7 +141,7 @@ class BLOCK_RES : public ELIST_LINK { * ROW_RES - Row results *************************************************************************/ -class ROW_RES : public ELIST_LINK { +class ROW_RES : public ELIST::LINK { public: ROW *row = nullptr; // real row int32_t char_count = 0; // chars in block @@ -163,7 +163,7 @@ enum 
CRUNCH_MODE { CR_NONE, CR_KEEP_SPACE, CR_LOOSE_SPACE, CR_DELETE }; // WERD_RES is a collection of publicly accessible members that gathers // information about a word result. -class TESS_API WERD_RES : public ELIST_LINK { +class TESS_API WERD_RES : public ELIST::LINK { public: // Which word is which? // There are 3 coordinate spaces in use here: a possibly rotated pixel space, @@ -347,7 +347,7 @@ class TESS_API WERD_RES : public ELIST_LINK { } // Deep copies everything except the ratings MATRIX. // To get that use deep_copy below. - WERD_RES(const WERD_RES &source) : ELIST_LINK(source) { + WERD_RES(const WERD_RES &source) : ELIST::LINK(source) { // combination is used in function Clear which is called from operator=. combination = false; *this = source; // see operator= diff --git a/src/ccstruct/points.h b/src/ccstruct/points.h index afe741875e..5a35e300d9 100644 --- a/src/ccstruct/points.h +++ b/src/ccstruct/points.h @@ -168,7 +168,7 @@ class ICOORD { TDimension ycoord; ///< y value }; -class ICOORDELT : public ELIST_LINK, +class ICOORDELT : public ELIST::LINK, public ICOORD // embedded coord list { diff --git a/src/ccstruct/polyblk.cpp b/src/ccstruct/polyblk.cpp index 5fbca3d80b..e270e2ce03 100644 --- a/src/ccstruct/polyblk.cpp +++ b/src/ccstruct/polyblk.cpp @@ -32,8 +32,6 @@ namespace tesseract { #define INTERSECTING INT16_MAX -int lessthan(const void *first, const void *second); - POLY_BLOCK::POLY_BLOCK(ICOORDELT_LIST *points, PolyBlockType t) { ICOORDELT_IT v = &vertices; @@ -357,13 +355,12 @@ ICOORDELT_LIST *PB_LINE_IT::get_line(TDimension y) { } if (!r.empty()) { - r.sort(lessthan); + r.sort([](const ICOORDELT *p1, const ICOORDELT *p2) { + // https://stackoverflow.com/questions/47466358/what-is-the-spaceship-three-way-comparison-operator-in-c + return (p1->x() <=> p2->x()); + }); + // TODO: remove loop after checking its history. 
-#if 0 - for (r.mark_cycle_pt(); !r.cycled_list(); r.forward()) { - x = r.data(); - } -#endif for (r.mark_cycle_pt(); !r.cycled_list(); r.forward()) { r.data()->set_y(r.data_relative(1)->x() - r.data()->x()); r.forward(); @@ -374,19 +371,6 @@ ICOORDELT_LIST *PB_LINE_IT::get_line(TDimension y) { return result; } -int lessthan(const void *first, const void *second) { - const ICOORDELT *p1 = *reinterpret_cast(first); - const ICOORDELT *p2 = *reinterpret_cast(second); - - if (p1->x() < p2->x()) { - return (-1); - } else if (p1->x() > p2->x()) { - return (1); - } else { - return (0); - } -} - #if !GRAPHICS_DISABLED /// Returns a color to draw the given type. Diagnostics::Color POLY_BLOCK::ColorForPolyBlockType(PolyBlockType type) { diff --git a/src/ccstruct/ratngs.cpp b/src/ccstruct/ratngs.cpp index e7803e2caa..fa20d56c43 100644 --- a/src/ccstruct/ratngs.cpp +++ b/src/ccstruct/ratngs.cpp @@ -108,7 +108,7 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id * * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. */ -BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST_LINK(other) { +BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST::LINK(other) { unichar_id_ = other.unichar_id(); rating_ = other.rating(); certainty_ = other.certainty(); @@ -127,7 +127,7 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST_LINK(other) { // Copy assignment operator. 
BLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) { - ELIST_LINK::operator=(other); + ELIST::LINK::operator=(other); unichar_id_ = other.unichar_id(); rating_ = other.rating(); certainty_ = other.certainty(); diff --git a/src/ccstruct/ratngs.h b/src/ccstruct/ratngs.h index b2180d132f..4d5cdbde32 100644 --- a/src/ccstruct/ratngs.h +++ b/src/ccstruct/ratngs.h @@ -53,7 +53,7 @@ enum BlobChoiceClassifier { }; DECL_FMT_FORMAT_TESSENUMTYPE(BlobChoiceClassifier); -class BLOB_CHOICE : public ELIST_LINK { +class BLOB_CHOICE : public ELIST::LINK { public: BLOB_CHOICE() { unichar_id_ = UNICHAR_SPACE; @@ -259,7 +259,7 @@ enum ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP }; const char *ScriptPosToString(ScriptPos script_pos); -class TESS_API WERD_CHOICE : public ELIST_LINK { +class TESS_API WERD_CHOICE : public ELIST::LINK { public: static const float kBadRating; static const char *permuter_name(uint8_t permuter); @@ -276,7 +276,7 @@ class TESS_API WERD_CHOICE : public ELIST_LINK { this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter); } WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset); - WERD_CHOICE(const WERD_CHOICE &word) : ELIST_LINK(word), unicharset_(word.unicharset_) { + WERD_CHOICE(const WERD_CHOICE &word) : ELIST::LINK(word), unicharset_(word.unicharset_) { this->init(word.length()); this->operator=(word); } diff --git a/src/ccstruct/rect.h b/src/ccstruct/rect.h index 97cc86a631..ec4aec219c 100644 --- a/src/ccstruct/rect.h +++ b/src/ccstruct/rect.h @@ -128,7 +128,7 @@ class TESS_API TBOX { // bounding box } } - TDimension width() const { // how high is it? + TDimension width() const { // how wide is it? 
if (!null_box()) { return top_right.x() - bot_left.x(); } else { diff --git a/src/ccstruct/stepblob.h b/src/ccstruct/stepblob.h index c8a542a980..2d80b5d7a3 100644 --- a/src/ccstruct/stepblob.h +++ b/src/ccstruct/stepblob.h @@ -37,7 +37,7 @@ class DENORM; ELISTIZEH(C_BLOB); -class TESS_API C_BLOB : public ELIST_LINK { +class TESS_API C_BLOB : public ELIST::LINK { public: C_BLOB() = default; explicit C_BLOB(C_OUTLINE_LIST *outline_list); @@ -136,9 +136,7 @@ class TESS_API C_BLOB : public ELIST_LINK { return blob; } - static int SortByXMiddle(const void *v1, const void *v2) { - const C_BLOB *blob1 = *static_cast(v1); - const C_BLOB *blob2 = *static_cast(v2); + static int SortByXMiddle(const C_BLOB *blob1, const C_BLOB *blob2) { return blob1->bounding_box().x_middle() - blob2->bounding_box().x_middle(); } diff --git a/src/ccstruct/werd.cpp b/src/ccstruct/werd.cpp index be147723ed..7d9d10a2e8 100644 --- a/src/ccstruct/werd.cpp +++ b/src/ccstruct/werd.cpp @@ -354,7 +354,7 @@ WERD *WERD::shallow_copy() { */ WERD &WERD::operator=(const WERD &source) { - this->ELIST2_LINK::operator=(source); + this->ELIST2::LINK::operator=(source); blanks = source.blanks; flags = source.flags; script_id_ = source.script_id_; @@ -373,9 +373,7 @@ WERD &WERD::operator=(const WERD &source) { * order of left edge. 
*/ -int word_comparator(const void *word1p, const void *word2p) { - const WERD *word1 = *reinterpret_cast(word1p); - const WERD *word2 = *reinterpret_cast(word2p); +int word_comparator(const WERD *word1, const WERD *word2) { return word1->bounding_box().left() - word2->bounding_box().left(); } diff --git a/src/ccstruct/werd.h b/src/ccstruct/werd.h index 619c2e491f..8fcd71889b 100644 --- a/src/ccstruct/werd.h +++ b/src/ccstruct/werd.h @@ -59,7 +59,7 @@ enum DISPLAY_FLAGS { class ROW; // forward decl -class TESS_API WERD : public ELIST2_LINK { +class TESS_API WERD : public ELIST2::LINK { public: WERD() = default; // WERD constructed with: @@ -212,7 +212,7 @@ ELIST2IZEH(WERD); namespace tesseract { // compare words by increasing order of left edge, suitable for qsort(3) -int word_comparator(const void *word1p, const void *word2p); +int word_comparator(const WERD *word1, const WERD *word2); } // namespace tesseract diff --git a/src/ccutil/ambigs.cpp b/src/ccutil/ambigs.cpp index fec2ad9361..f0006e20ed 100644 --- a/src/ccutil/ambigs.cpp +++ b/src/ccutil/ambigs.cpp @@ -44,12 +44,9 @@ static const char kAmbigDelimiters[] = "\t "; // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1); -AmbigSpec::AmbigSpec() { +AmbigSpec::AmbigSpec() : correct_ngram_id(INVALID_UNICHAR_ID), type(NOT_AMBIG), wrong_ngram_size(0) { wrong_ngram[0] = INVALID_UNICHAR_ID; correct_fragments[0] = INVALID_UNICHAR_ID; - correct_ngram_id = INVALID_UNICHAR_ID; - type = NOT_AMBIG; - wrong_ngram_size = 0; } // Initializes the ambigs by adding a nullptr pointer to each table. diff --git a/src/ccutil/ambigs.h b/src/ccutil/ambigs.h index b63bf231e8..1d6f7489de 100644 --- a/src/ccutil/ambigs.h +++ b/src/ccutil/ambigs.h @@ -107,7 +107,7 @@ class UnicharIdArrayUtils { // AMBIG_SPEC_LIST stores a list of dangerous ambigs that // start with the same unichar (e.g. r->t rn->m rr1->m). 
-class AmbigSpec : public ELIST_LINK { +class AmbigSpec : public ELIST::LINK { public: AmbigSpec(); ~AmbigSpec() = default; @@ -115,9 +115,7 @@ class AmbigSpec : public ELIST_LINK { // Comparator function for sorting AmbigSpec_LISTs. The lists will // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1]. - static int compare_ambig_specs(const void *spec1, const void *spec2) { - const AmbigSpec *s1 = *static_cast(spec1); - const AmbigSpec *s2 = *static_cast(spec2); + static int compare_ambig_specs(const AmbigSpec *s1, const AmbigSpec *s2) { int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); if (result != 0) { return result; diff --git a/src/ccutil/ccutil.cpp b/src/ccutil/ccutil.cpp index aea244a2a5..4804ac7b84 100644 --- a/src/ccutil/ccutil.cpp +++ b/src/ccutil/ccutil.cpp @@ -12,18 +12,14 @@ // Include automatically generated configuration file if running autoconf. #include // compiler config, etc. +#include #include "ccutil.h" -#include "winutils.h" +#include "tesserrstream.h" // for tesserr #include "pathutils.h" #include "helpers.h" -#if defined(_WIN32) -# include // for _access -#endif - #include -#include // for std::strrchrA #include // for std::filesystem @@ -189,7 +185,7 @@ static bool determine_datadir(std::string &datadir, const std::string &argv0, co } #if defined(_WIN32) - if (datadir.empty() || _access(datadir.c_str(), 0) != 0) { + if (datadir.empty() || !std::filesystem::exists(datadir)) { /* Look for tessdata in directory of executable. */ wchar_t pth[MAX_PATH]; DWORD length = GetModuleFileNameW(nullptr, pth, MAX_PATH); @@ -266,4 +262,74 @@ int CCUtil::main_setup(const std::string &argv0, const std::string &output_image return 0; } +/** + * @brief Finds the path to the tessdata directory. + * + * This function determines the location of the tessdata directory based on the + * following order of precedence: + * 1. If `argv0` is provided, use it. 
+ * 2. If `TESSDATA_PREFIX` environment variable is set and the path exists, use + * it. + * 3. On Windows, check for a "tessdata" directory in the executable's directory + * and use it. + * 4. If `TESSDATA_PREFIX` is defined at compile time, use its "tessdata" subdirectory. + * 5. Otherwise, use the current working directory. + * + * @param argv0 argument to be considered as the data directory path. + * @return The path to the tessdata directory or current directory. + */ +std::filesystem::path find_data_path(const std::string &argv0) { + // If argv0 is set, always use it even if it is not a valid directory + if (!argv0.empty()) { + std::filesystem::path path(argv0); + if (!std::filesystem::is_directory(path)) { + tprintWarn("(tessdata): '{}' is not a valid directory.\n", argv0); + } + return path; + } + + // Check environment variable if argv0 is not specified + if (const char *tessdata_prefix = std::getenv("TESSDATA_PREFIX")) { + std::filesystem::path path(tessdata_prefix); + if (std::filesystem::exists(path)) { + return path; + } else { + tprintWarn("TESSDATA_PREFIX '{}' does not exist, ignoring.\n", + tessdata_prefix); + } + } + +#ifdef _WIN32 + // Windows-specific: check whether a 'tessdata' directory exists in the + // executable's directory + wchar_t path[MAX_PATH]; + if (DWORD length = GetModuleFileNameW(nullptr, path, MAX_PATH); + length > 0 && length < MAX_PATH) { + std::filesystem::path exe_path(path); + auto tessdata_subdir = exe_path.parent_path() / "tessdata"; + if (std::filesystem::exists(tessdata_subdir)) { + return tessdata_subdir; + } + } +#endif + + // Fallback to compile-time or current directory +#ifdef TESSDATA_PREFIX + return std::filesystem::path(TESSDATA_PREFIX) / "tessdata"; +#else + return std::filesystem::current_path(); +#endif +} + + +/** + * @brief CCUtil::main_setup - set location of tessdata and name of image + * + * @param argv0 - paths to the directory with language files and config files.
+ */ +void CCUtil::main_setup(const std::string &argv0, const std::string &basename) { + imagebasename_ = basename; /**< name of image */ + datadir_ = find_data_path(argv0); +} + } // namespace tesseract diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h index 83e95929d1..8ae9c0e882 100644 --- a/src/ccutil/ccutil.h +++ b/src/ccutil/ccutil.h @@ -19,6 +19,8 @@ #ifndef TESSERACT_CCUTIL_CCUTIL_H_ #define TESSERACT_CCUTIL_CCUTIL_H_ +#include // for std::filesystem + #include // compiler config, etc. #if !(defined(WIN32) || defined(_WIN32) || defined(_WIN64)) @@ -81,7 +83,7 @@ class TESS_API CCUtil { std::string visible_image_file_path_; // name of currently input file, used for visible overlays only std::string input_file_path_; // name of currently processed input file - std::string datadir_; // dir for data files + std::filesystem::path datadir_; // dir for data files std::string imagebasename_; // name of image std::string lang_; std::string language_data_path_prefix_; diff --git a/src/ccutil/clst.cpp b/src/ccutil/clst.cpp deleted file mode 100644 index 9fd5987d29..0000000000 --- a/src/ccutil/clst.cpp +++ /dev/null @@ -1,450 +0,0 @@ -/********************************************************************** - * File: clst.cpp (Formerly clist.c) - * Description: CONS cell list handling code which is not in the include file. - * Author: Phil Cheatle - * - * (C) Copyright 1991, Hewlett-Packard Ltd. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include // compiler config, etc. - -#include "clst.h" -#include - -namespace tesseract { - -/*********************************************************************** - * CLIST::internal_deep_clear - * - * Used by the "deep_clear" member function of derived list - * classes to destroy all the elements on the list. - * The calling function passes a "zapper" function which can be called to - * delete each data element of the list, regardless of its class. This - * technique permits a generic clear function to destroy elements of - * different derived types correctly, without requiring virtual functions and - * the consequential memory overhead. - **********************************************************************/ - -void CLIST::internal_deep_clear( // destroy all links - void (*zapper)(void *)) { // ptr to zapper functn - if (!empty()) { - auto ptr = last->next; // set to first - last->next = nullptr; // break circle - last = nullptr; // set list empty - while (ptr) { - auto next = ptr->next; - zapper(ptr->data); - delete (ptr); - ptr = next; - } - } -} - -/*********************************************************************** - * CLIST::shallow_clear - * - * Used by the destructor and the "shallow_clear" member function of derived - * list classes to destroy the list. - * The data elements are NOT destroyed. - * - **********************************************************************/ - -void CLIST::shallow_clear() { // destroy all links - if (!empty()) { - auto ptr = last->next; // set to first - last->next = nullptr; // break circle - last = nullptr; // set list empty - while (ptr) { - auto next = ptr->next; - delete (ptr); - ptr = next; - } - } -} - -/*********************************************************************** - * CLIST::assign_to_sublist - * - * The list is set to a sublist of another list. "This" list must be empty - * before this function is invoked. 
The two iterators passed must refer to - * the same list, different from "this" one. The sublist removed is the - * inclusive list from start_it's current position to end_it's current - * position. If this range passes over the end of the source list then the - * source list has its end set to the previous element of start_it. The - * extracted sublist is unaffected by the end point of the source list, its - * end point is always the end_it position. - **********************************************************************/ - -void CLIST::assign_to_sublist( // to this list - CLIST_ITERATOR *start_it, // from list start - CLIST_ITERATOR *end_it) { // from list end - constexpr ERRCODE LIST_NOT_EMPTY("Destination list must be empty before extracting a sublist"); - - if (!empty()) { - LIST_NOT_EMPTY.abort("CLIST.assign_to_sublist"); - } - - last = start_it->extract_sublist(end_it); -} - -/*********************************************************************** - * CLIST::sort - * - * Sort elements on list - **********************************************************************/ - -void CLIST::sort( // sort elements - int comparator( // comparison routine - const void *, const void *)) { - // Allocate an array of pointers, one per list element. - auto count = length(); - if (count > 0) { - // ptr array to sort - std::vector base; - base.reserve(count); - - CLIST_ITERATOR it(this); - - // Extract all elements, putting the pointers in the array. - for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - base.push_back(it.extract()); - } - - // Sort the pointer array. - qsort(&base[0], count, sizeof(base[0]), comparator); - - // Rebuild the list from the sorted pointers. - for (auto current : base) { - it.add_to_end(current); - } - } -} - -// Assuming list has been sorted already, insert new_data to -// keep the list sorted according to the same comparison function. -// Comparison function is the same as used by sort, i.e. uses double -// indirection. 
Time is O(1) to add to beginning or end. -// Time is linear to add pre-sorted items to an empty list. -// If unique, then don't add duplicate entries. -// Returns true if the element was added to the list. -bool CLIST::add_sorted(int comparator(const void *, const void *), bool unique, void *new_data) { - // Check for adding at the end. - if (last == nullptr || comparator(&last->data, &new_data) < 0) { - auto *new_element = new CLIST_LINK; - new_element->data = new_data; - if (last == nullptr) { - new_element->next = new_element; - } else { - new_element->next = last->next; - last->next = new_element; - } - last = new_element; - return true; - } else if (!unique || last->data != new_data) { - // Need to use an iterator. - CLIST_ITERATOR it(this); - for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - void *data = it.data(); - if (data == new_data && unique) { - return false; - } - if (comparator(&data, &new_data) > 0) { - break; - } - } - if (it.cycled_list()) { - it.add_to_end(new_data); - } else { - it.add_before_then_move(new_data); - } - return true; - } - return false; -} - -// Assuming that the minuend and subtrahend are already sorted with -// the same comparison function, shallow clears this and then copies -// the set difference minuend - subtrahend to this, being the elements -// of minuend that do not compare equal to anything in subtrahend. -// If unique is true, any duplicates in minuend are also eliminated. -void CLIST::set_subtract(int comparator(const void *, const void *), bool unique, CLIST *minuend, - CLIST *subtrahend) { - shallow_clear(); - CLIST_ITERATOR m_it(minuend); - CLIST_ITERATOR s_it(subtrahend); - // Since both lists are sorted, finding the subtras that are not - // minus is a case of a parallel iteration. 
- for (m_it.mark_cycle_pt(); !m_it.cycled_list(); m_it.forward()) { - void *minu = m_it.data(); - void *subtra = nullptr; - if (!s_it.empty()) { - subtra = s_it.data(); - while (!s_it.at_last() && comparator(&subtra, &minu) < 0) { - s_it.forward(); - subtra = s_it.data(); - } - } - if (subtra == nullptr || comparator(&subtra, &minu) != 0) { - add_sorted(comparator, unique, minu); - } - } -} - -/*********************************************************************** - * MEMBER FUNCTIONS OF CLASS: CLIST_ITERATOR - * ========================================= - **********************************************************************/ - -/*********************************************************************** - * CLIST_ITERATOR::forward - * - * Move the iterator to the next element of the list. - * REMEMBER: ALL LISTS ARE CIRCULAR. - **********************************************************************/ - -void *CLIST_ITERATOR::forward() { - if (list->empty()) { - return nullptr; - } - - if (current) { // not removed so - // set previous - prev = current; - started_cycling = true; - // In case next is deleted by another iterator, get next from current. - current = current->next; - } else { - if (ex_current_was_cycle_pt) { - cycle_pt = next; - } - current = next; - } - - next = current->next; - return current->data; -} - -/*********************************************************************** - * CLIST_ITERATOR::data_relative - * - * Return the data pointer to the element "offset" elements from current. - * "offset" must not be less than -1. - * (This function can't be INLINEd because it contains a loop) - **********************************************************************/ - -void *CLIST_ITERATOR::data_relative( // get data + or - ... 
- int8_t offset) { // offset from current - CLIST_LINK *ptr; - -#ifndef NDEBUG - if (!list) - NO_LIST.abort("CLIST_ITERATOR::data_relative"); - if (list->empty()) - EMPTY_LIST.abort("CLIST_ITERATOR::data_relative"); - if (offset < -1) - BAD_PARAMETER.abort("CLIST_ITERATOR::data_relative", "offset < -l"); -#endif - - if (offset == -1) { - ptr = prev; - } else { - for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) { -#ifndef NDEBUG - if (!ptr) - BAD_PARAMETER.error("CLIST_ITERATOR::data_relative", ABORT, "ptr == nullptr"); -#endif - ; - } - } - - return ptr->data; -} - -/*********************************************************************** - * CLIST_ITERATOR::move_to_last() - * - * Move current so that it is set to the end of the list. - * Return data just in case anyone wants it. - * (This function can't be INLINEd because it contains a loop) - **********************************************************************/ - -void *CLIST_ITERATOR::move_to_last() { - while (current != list->last) { - forward(); - } - - if (current == nullptr) { - return nullptr; - } else { - return current->data; - } -} - -/*********************************************************************** - * CLIST_ITERATOR::exchange() - * - * Given another iterator, whose current element is a different element on - * the same list list OR an element of another list, exchange the two current - * elements. On return, each iterator points to the element which was the - * other iterators current on entry. - * (This function hasn't been in-lined because its a bit big!) 
- **********************************************************************/ - -void CLIST_ITERATOR::exchange( // positions of 2 links - CLIST_ITERATOR *other_it) { // other iterator - constexpr ERRCODE DONT_EXCHANGE_DELETED("Can't exchange deleted elements of lists"); - - /* Do nothing if either list is empty or if both iterators reference the same -link */ - - if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) { - return; - } - - /* Error if either current element is deleted */ - - if (!current || !other_it->current) { - DONT_EXCHANGE_DELETED.abort("CLIST_ITERATOR.exchange"); - } - - /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements -(other before this); non-doubleton adjacent elements (this before other); -non-adjacent elements. */ - - // adjacent links - if ((next == other_it->current) || (other_it->next == current)) { - // doubleton list - if ((next == other_it->current) && (other_it->next == current)) { - prev = next = current; - other_it->prev = other_it->next = other_it->current; - } else { // non-doubleton with - // adjacent links - // other before this - if (other_it->next == current) { - other_it->prev->next = current; - other_it->current->next = next; - current->next = other_it->current; - other_it->next = other_it->current; - prev = current; - } else { // this before other - prev->next = other_it->current; - current->next = other_it->next; - other_it->current->next = current; - next = current; - other_it->prev = other_it->current; - } - } - } else { // no overlap - prev->next = other_it->current; - current->next = other_it->next; - other_it->prev->next = current; - other_it->current->next = next; - } - - /* update end of list pointer when necessary (remember that the 2 iterators - may iterate over different lists!) 
*/ - - if (list->last == current) { - list->last = other_it->current; - } - if (other_it->list->last == other_it->current) { - other_it->list->last = current; - } - - if (current == cycle_pt) { - cycle_pt = other_it->cycle_pt; - } - if (other_it->current == other_it->cycle_pt) { - other_it->cycle_pt = cycle_pt; - } - - /* The actual exchange - in all cases*/ - - auto old_current = current; - current = other_it->current; - other_it->current = old_current; -} - -/*********************************************************************** - * CLIST_ITERATOR::extract_sublist() - * - * This is a private member, used only by CLIST::assign_to_sublist. - * Given another iterator for the same list, extract the links from THIS to - * OTHER inclusive, link them into a new circular list, and return a - * pointer to the last element. - * (Can't inline this function because it contains a loop) - **********************************************************************/ - -CLIST_LINK *CLIST_ITERATOR::extract_sublist( // from this current - CLIST_ITERATOR *other_it) { // to other current - CLIST_ITERATOR temp_it = *this; - - constexpr ERRCODE BAD_SUBLIST("Can't find sublist end point in original list"); -#ifndef NDEBUG - constexpr ERRCODE BAD_EXTRACTION_PTS("Can't extract sublist from points on different lists"); - constexpr ERRCODE DONT_EXTRACT_DELETED("Can't extract a sublist marked by deleted points"); - - if (list != other_it->list) - BAD_EXTRACTION_PTS.abort("CLIST_ITERATOR.extract_sublist"); - if (list->empty()) - EMPTY_LIST.abort("CLIST_ITERATOR::extract_sublist"); - - if (!current || !other_it->current) - DONT_EXTRACT_DELETED.abort("CLIST_ITERATOR.extract_sublist"); -#endif - - ex_current_was_last = other_it->ex_current_was_last = false; - ex_current_was_cycle_pt = false; - other_it->ex_current_was_cycle_pt = false; - - temp_it.mark_cycle_pt(); - do { // walk sublist - if (temp_it.cycled_list()) { // can't find end pt - BAD_SUBLIST.abort("CLIST_ITERATOR.extract_sublist"); - } - - 
if (temp_it.at_last()) { - list->last = prev; - ex_current_was_last = other_it->ex_current_was_last = true; - } - - if (temp_it.current == cycle_pt) { - ex_current_was_cycle_pt = true; - } - - if (temp_it.current == other_it->cycle_pt) { - other_it->ex_current_was_cycle_pt = true; - } - - temp_it.forward(); - } while (temp_it.prev != other_it->current); - - // circularise sublist - other_it->current->next = current; - auto end_of_new_list = other_it->current; - - // sublist = whole list - if (prev == other_it->current) { - list->last = nullptr; - prev = current = next = nullptr; - other_it->prev = other_it->current = other_it->next = nullptr; - } else { - prev->next = other_it->next; - current = other_it->current = nullptr; - next = other_it->next; - other_it->prev = prev; - } - return end_of_new_list; -} - -} // namespace tesseract diff --git a/src/ccutil/clst.h b/src/ccutil/clst.h index b00053de05..604cde16e7 100644 --- a/src/ccutil/clst.h +++ b/src/ccutil/clst.h @@ -19,706 +19,983 @@ #ifndef CLST_H #define CLST_H -#include "list.h" #include "lsterr.h" #include "serialis.h" +#include #include namespace tesseract { -class CLIST_ITERATOR; - -/********************************************************************** - * CLASS - CLIST_LINK - * - * Generic link class for singly linked CONS cell lists - * - * Note: No destructor - elements are assumed to be destroyed EITHER after - * they have been extracted from a list OR by the CLIST destructor which - * walks the list. 
- **********************************************************************/ - -class CLIST_LINK { - friend class CLIST_ITERATOR; - friend class CLIST; - - CLIST_LINK *next; - void *data; - -public: - CLIST_LINK() { // constructor - data = next = nullptr; - } - - CLIST_LINK(const CLIST_LINK &) = delete; - void operator=(const CLIST_LINK &) = delete; -}; - /********************************************************************** * CLASS - CLIST * * Generic list class for singly linked CONS cell lists **********************************************************************/ -class TESS_API CLIST { - friend class CLIST_ITERATOR; - - CLIST_LINK *last = nullptr; // End of list - - //(Points to head) - CLIST_LINK *First() { // return first - return last != nullptr ? last->next : nullptr; - } - - const CLIST_LINK *First() const { // return first - return last != nullptr ? last->next : nullptr; - } +template +class ConsList { + friend class Link; public: - ~CLIST() { // destructor - shallow_clear(); - } + /********************************************************************** + * CLASS - Link + * + * Generic link class for singly linked CONS cell lists + * + * Note: No destructor - elements are assumed to be destroyed EITHER after + * they have been extracted from a list OR by the ConsList destructor which + * walks the list. 
+ **********************************************************************/ + struct Link { + Link *next{}; + T *data{}; + + Link() = default; + Link(const Link &) = delete; + void operator=(const Link &) = delete; + }; + + /*********************************************************************** + * CLASS - Iterator + * + * Generic iterator class for singly linked lists with embedded + *links + **********************************************************************/ + class Iterator { + ConsList *list = nullptr; // List being iterated + Link *prev = nullptr; // prev element + Link *current = nullptr; // current element + Link *next = nullptr; // next element + Link *cycle_pt = nullptr; // point we are cycling the list to. + bool ex_current_was_last = false; // current extracted was end of list + bool ex_current_was_cycle_pt = false; // current extracted was cycle point + bool started_cycling = false; // Have we moved off the start? + + /*********************************************************************** + * Iterator::extract_sublist() + * + * This is a private member, used only by ConsList::assign_to_sublist. + * Given another iterator for the same list, extract the links from THIS to + * OTHER inclusive, link them into a new circular list, and return a + * pointer to the last element. 
+ * (Can't inline this function because it contains a loop) + **********************************************************************/ + Link *extract_sublist( // from this current + Iterator *other_it) { // to other current + Iterator temp_it = *this; + + constexpr ERRCODE BAD_SUBLIST("Can't find sublist end point in original list"); +#ifndef NDEBUG + constexpr ERRCODE BAD_EXTRACTION_PTS("Can't extract sublist from points on different lists"); + constexpr ERRCODE DONT_EXTRACT_DELETED("Can't extract a sublist marked by deleted points"); - void internal_deep_clear( // destroy all links - void (*zapper)(void *)); // ptr to zapper functn + if (list != other_it->list) + BAD_EXTRACTION_PTS.error("Iterator.extract_sublist", ABORT); + if (list->empty()) + EMPTY_LIST.error("Iterator::extract_sublist", ABORT); - void shallow_clear(); // clear list but don't - // delete data elements + if (!current || !other_it->current) + DONT_EXTRACT_DELETED.error("Iterator.extract_sublist", ABORT); +#endif - bool empty() const { // is list empty? - return !last; - } + ex_current_was_last = other_it->ex_current_was_last = false; + ex_current_was_cycle_pt = false; + other_it->ex_current_was_cycle_pt = false; - bool singleton() const { - return last != nullptr ? (last == last->next) : false; - } + temp_it.mark_cycle_pt(); + do { // walk sublist + if (temp_it.cycled_list()) { // can't find end pt + BAD_SUBLIST.error("Iterator.extract_sublist", ABORT); + } - void shallow_copy( // dangerous!! - CLIST *from_list) { // beware destructors!! 
- last = from_list->last; - } + if (temp_it.at_last()) { + list->last = prev; + ex_current_was_last = other_it->ex_current_was_last = true; + } - void assign_to_sublist( // to this list - CLIST_ITERATOR *start_it, // from list start - CLIST_ITERATOR *end_it); // from list end + if (temp_it.current == cycle_pt) { + ex_current_was_cycle_pt = true; + } - int32_t length() const { //# elements in list - int32_t count = 0; - if (last != nullptr) { - count = 1; - for (auto it = last->next; it != last; it = it->next) { - count++; + if (temp_it.current == other_it->cycle_pt) { + other_it->ex_current_was_cycle_pt = true; + } + + temp_it.forward(); + } while (temp_it.prev != other_it->current); + + // circularise sublist + other_it->current->next = current; + auto end_of_new_list = other_it->current; + + // sublist = whole list + if (prev == other_it->current) { + list->last = nullptr; + prev = current = next = nullptr; + other_it->prev = other_it->current = other_it->next = nullptr; + } else { + prev->next = other_it->next; + current = other_it->current = nullptr; + next = other_it->next; + other_it->prev = prev; } + return end_of_new_list; } - return count; - } - void sort( // sort elements - int comparator( // comparison routine - const void *, const void *)); - - // Assuming list has been sorted already, insert new_data to - // keep the list sorted according to the same comparison function. - // Comparison function is the same as used by sort, i.e. uses double - // indirection. Time is O(1) to add to beginning or end. - // Time is linear to add pre-sorted items to an empty list. - // If unique, then don't add duplicate entries. - // Returns true if the element was added to the list. 
- bool add_sorted(int comparator(const void *, const void *), bool unique, void *new_data); - - // Assuming that the minuend and subtrahend are already sorted with - // the same comparison function, shallow clears this and then copies - // the set difference minuend - subtrahend to this, being the elements - // of minuend that do not compare equal to anything in subtrahend. - // If unique is true, any duplicates in minuend are also eliminated. - void set_subtract(int comparator(const void *, const void *), bool unique, CLIST *minuend, - CLIST *subtrahend); -}; - -/*********************************************************************** - * CLASS - CLIST_ITERATOR - * - * Generic iterator class for singly linked lists with embedded - *links - **********************************************************************/ - -class TESS_API CLIST_ITERATOR { - friend void CLIST::assign_to_sublist(CLIST_ITERATOR *, CLIST_ITERATOR *); - - CLIST *list; // List being iterated - CLIST_LINK *prev; // prev element - CLIST_LINK *current; // current element - CLIST_LINK *next; // next element - CLIST_LINK *cycle_pt; // point we are cycling the list to. - bool ex_current_was_last; // current extracted was end of list - bool ex_current_was_cycle_pt; // current extracted was cycle point - bool started_cycling; // Have we moved off the start? - - CLIST_LINK *extract_sublist( // from this current... 
- CLIST_ITERATOR *other_it); // to other current - -public: - CLIST_ITERATOR() { // constructor - list = nullptr; - prev = nullptr; - current = nullptr; - next = nullptr; - cycle_pt = nullptr; - ex_current_was_last = false; - ex_current_was_cycle_pt = false; - started_cycling = false; - } // unassigned list - - CLIST_ITERATOR( // constructor - CLIST *list_to_iterate); - - void set_to_list( // change list - CLIST *list_to_iterate); - - void add_after_then_move( // add after current & - void *new_data); // move to new - - void add_after_stay_put( // add after current & - void *new_data); // stay at current - - void add_before_then_move( // add before current & - void *new_data); // move to new - - void add_before_stay_put( // add before current & - void *new_data); // stay at current - - void add_list_after( // add a list & - CLIST *list_to_add); // stay at current + public: + Iterator() { // constructor + list = nullptr; + } // unassigned list + + /*********************************************************************** + * Iterator::Iterator + * + * CONSTRUCTOR - set iterator to specified list; + **********************************************************************/ + Iterator( // constructor + ConsList *list_to_iterate) { + set_to_list(list_to_iterate); + } - void add_list_before( // add a list & - CLIST *list_to_add); // move to it 1st item + /*********************************************************************** + * Iterator::set_to_list + * + * (Re-)initialise the iterator to point to the start of the list_to_iterate + * over. + **********************************************************************/ + void set_to_list( // change list + ConsList *list_to_iterate) { + list = list_to_iterate; + prev = list->last; + current = list->First(); + next = current != nullptr ? 
current->next : nullptr; + cycle_pt = nullptr; // await explicit set + started_cycling = false; + ex_current_was_last = false; + ex_current_was_cycle_pt = false; + } - void *data() { // get current data + /*********************************************************************** + * Iterator::add_after_then_move + * + * Add a new element to the list after the current element and move the + * iterator to the new element. + **********************************************************************/ + void add_after_then_move( // add after current & + T *new_data) { #ifndef NDEBUG - if (!list) { - NO_LIST.abort("CLIST_ITERATOR::data"); - } + if (!new_data) { + BAD_PARAMETER.error("Iterator::add_after_then_move", ABORT, "new_data is nullptr"); + } #endif - return current->data; - } - - void *data_relative( // get data + or - ... - int8_t offset); // offset from current - - void *forward(); // move to next element - - void *extract(); // remove from list - void *move_to_first(); // go to start of list + auto new_element = new Link; + new_element->data = new_data; - void *move_to_last(); // go to end of list + if (list->empty()) { + new_element->next = new_element; + list->last = new_element; + prev = next = new_element; + } else { + new_element->next = next; + + if (current) { // not extracted + current->next = new_element; + prev = current; + if (current == list->last) { + list->last = new_element; + } + } else { // current extracted + prev->next = new_element; + if (ex_current_was_last) { + list->last = new_element; + } + if (ex_current_was_cycle_pt) { + cycle_pt = new_element; + } + } + } + current = new_element; + } // move to new + + /*********************************************************************** + * Iterator::add_after_stay_put + * + * Add a new element to the list after the current element but do not move + * the iterator to the new element. 
+ **********************************************************************/ + void add_after_stay_put( // add after current & + T *new_data) { +#ifndef NDEBUG + if (!new_data) { + BAD_PARAMETER.error("Iterator::add_after_stay_put", ABORT, "new_data is nullptr"); + } +#endif - void mark_cycle_pt(); // remember current + auto new_element = new Link; + new_element->data = new_data; - bool empty() const { // is list empty? - return list->empty(); - } + if (list->empty()) { + new_element->next = new_element; + list->last = new_element; + prev = next = new_element; + ex_current_was_last = false; + current = nullptr; + } else { + new_element->next = next; + + if (current) { // not extracted + current->next = new_element; + if (prev == current) { + prev = new_element; + } + if (current == list->last) { + list->last = new_element; + } + } else { // current extracted + prev->next = new_element; + if (ex_current_was_last) { + list->last = new_element; + ex_current_was_last = false; + } + } + next = new_element; + } + } // stay at current + + /*********************************************************************** + * Iterator::add_before_then_move + * + * Add a new element to the list before the current element and move the + * iterator to the new element. + **********************************************************************/ + void add_before_then_move( // add before current & + T *new_data) { +#ifndef NDEBUG + if (!new_data) { + BAD_PARAMETER.error("Iterator::add_before_then_move", ABORT, "new_data is nullptr"); + } +#endif - bool current_extracted() const { // current extracted? - return !current; - } + auto new_element = new Link; + new_element->data = new_data; - bool at_first() const; // Current is first? 
+ if (list->empty()) { + new_element->next = new_element; + list->last = new_element; + prev = next = new_element; + } else { + prev->next = new_element; + if (current) { // not extracted + new_element->next = current; + next = current; + } else { // current extracted + new_element->next = next; + if (ex_current_was_last) { + list->last = new_element; + } + if (ex_current_was_cycle_pt) { + cycle_pt = new_element; + } + } + } + current = new_element; + } // move to new + + /*********************************************************************** + * Iterator::add_before_stay_put + * + * Add a new element to the list before the current element but don't move the + * iterator to the new element. + **********************************************************************/ + void add_before_stay_put( // add before current & + T *new_data) { +#ifndef NDEBUG + if (!new_data) { + BAD_PARAMETER.error("Iterator::add_before_stay_put", ABORT, "new_data is nullptr"); + } +#endif - bool at_last() const; // Current is last? + auto new_element = new Link; + new_element->data = new_data; - bool cycled_list() const; // Completed a cycle? + if (list->empty()) { + new_element->next = new_element; + list->last = new_element; + prev = next = new_element; + ex_current_was_last = true; + current = nullptr; + } else { + prev->next = new_element; + if (current) { // not extracted + new_element->next = current; + if (next == current) { + next = new_element; + } + } else { // current extracted + new_element->next = next; + if (ex_current_was_last) { + list->last = new_element; + } + } + prev = new_element; + } + } // stay at current + + /*********************************************************************** + * Iterator::add_list_after + * + * Insert another list to this list after the current element but don't move + *the + * iterator. 
+ **********************************************************************/ + void add_list_after( // add a list & + ConsList *list_to_add) { + if (!list_to_add->empty()) { + if (list->empty()) { + list->last = list_to_add->last; + prev = list->last; + next = list->First(); + ex_current_was_last = true; + current = nullptr; + } else { + if (current) { // not extracted + current->next = list_to_add->First(); + if (current == list->last) { + list->last = list_to_add->last; + } + list_to_add->last->next = next; + next = current->next; + } else { // current extracted + prev->next = list_to_add->First(); + if (ex_current_was_last) { + list->last = list_to_add->last; + ex_current_was_last = false; + } + list_to_add->last->next = next; + next = prev->next; + } + } + list_to_add->last = nullptr; + } + } // stay at current + + /*********************************************************************** + * Iterator::add_list_before + * + * Insert another list to this list before the current element. Move the + * iterator to the start of the inserted elements + * iterator. 
+ **********************************************************************/ + void add_list_before( // add a list & + ConsList *list_to_add) { + if (!list_to_add->empty()) { + if (list->empty()) { + list->last = list_to_add->last; + prev = list->last; + current = list->First(); + next = current->next; + ex_current_was_last = false; + } else { + prev->next = list_to_add->First(); + if (current) { // not extracted + list_to_add->last->next = current; + } else { // current extracted + list_to_add->last->next = next; + if (ex_current_was_last) { + list->last = list_to_add->last; + } + if (ex_current_was_cycle_pt) { + cycle_pt = prev->next; + } + } + current = prev->next; + next = current->next; + } + list_to_add->last = nullptr; + } + } // move to it 1st item - void add_to_end( // add at end & - void *new_data); // don't move + T *data() { // get current data +#ifndef NDEBUG + if (!list) { + NO_LIST.error("Iterator::data", ABORT); + } +#endif + return current->data; + } - void exchange( // positions of 2 links - CLIST_ITERATOR *other_it); // other iterator + /*********************************************************************** + * Iterator::data_relative + * + * Return the data pointer to the element "offset" elements from current. + * "offset" must not be less than -1. + * (This function can't be INLINEd because it contains a loop) + **********************************************************************/ + T *data_relative( // get data + or - ... 
+ int8_t offset) { // offset from current + Link *ptr; - int32_t length() const; //# elements in list +#ifndef NDEBUG + if (!list) + NO_LIST.error("Iterator::data_relative", ABORT); + if (list->empty()) + EMPTY_LIST.error("Iterator::data_relative", ABORT); + if (offset < -1) + BAD_PARAMETER.error("Iterator::data_relative", ABORT, "offset < -l"); +#endif - void sort( // sort elements - int comparator( // comparison routine - const void *, const void *)); -}; + if (offset == -1) { + ptr = prev; + } else { + for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) { + ; + } + } -/*********************************************************************** - * CLIST_ITERATOR::set_to_list - * - * (Re-)initialise the iterator to point to the start of the list_to_iterate - * over. - **********************************************************************/ + return ptr->data; + } -inline void CLIST_ITERATOR::set_to_list( // change list - CLIST *list_to_iterate) { - list = list_to_iterate; - prev = list->last; - current = list->First(); - next = current != nullptr ? current->next : nullptr; - cycle_pt = nullptr; // await explicit set - started_cycling = false; - ex_current_was_last = false; - ex_current_was_cycle_pt = false; -} - -/*********************************************************************** - * CLIST_ITERATOR::CLIST_ITERATOR - * - * CONSTRUCTOR - set iterator to specified list; - **********************************************************************/ + /*********************************************************************** + * Iterator::forward + * + * Move the iterator to the next element of the list. + * REMEMBER: ALL LISTS ARE CIRCULAR. 
+ **********************************************************************/ + T *forward() { + if (list->empty()) { + return nullptr; + } -inline CLIST_ITERATOR::CLIST_ITERATOR(CLIST *list_to_iterate) { - set_to_list(list_to_iterate); -} + if (current) { // not removed so + // set previous + prev = current; + started_cycling = true; + // In case next is deleted by another iterator, get next from current. + current = current->next; + } else { + if (ex_current_was_cycle_pt) { + cycle_pt = next; + } + current = next; + } -/*********************************************************************** - * CLIST_ITERATOR::add_after_then_move - * - * Add a new element to the list after the current element and move the - * iterator to the new element. - **********************************************************************/ + next = current->next; + return current->data; + } -inline void CLIST_ITERATOR::add_after_then_move( // element to add - void *new_data) { + /*********************************************************************** + * Iterator::extract + * + * Do extraction by removing current from the list, deleting the cons cell + * and returning the data to the caller, but NOT updating the iterator. (So + * that any calling loop can do this.) The iterator's current points to + * nullptr. If the data is to be deleted, this is the callers responsibility. 
+ **********************************************************************/ + T *extract() { #ifndef NDEBUG - if (!new_data) { - BAD_PARAMETER.abort("CLIST_ITERATOR::add_after_then_move", "new_data is nullptr"); - } + if (!current) { // list empty or + // element extracted + NULL_CURRENT.error("Iterator::extract", ABORT); + } #endif - auto new_element = new CLIST_LINK; - new_element->data = new_data; - - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - } else { - new_element->next = next; + if (list->singleton()) { + // Special case where we do need to change the iterator. + prev = next = list->last = nullptr; + } else { + prev->next = next; // remove from list - if (current) { // not extracted - current->next = new_element; - prev = current; - if (current == list->last) { - list->last = new_element; + if (current == list->last) { + list->last = prev; + ex_current_was_last = true; + } else { + ex_current_was_last = false; + } } - } else { // current extracted - prev->next = new_element; - if (ex_current_was_last) { - list->last = new_element; + // Always set ex_current_was_cycle_pt so an add/forward will work in a loop. + ex_current_was_cycle_pt = (current == cycle_pt); + auto extracted_data = current->data; + delete (current); // destroy CONS cell + current = nullptr; + return extracted_data; + } // remove from list + + /*********************************************************************** + * Iterator::move_to_first() + * + * Move current so that it is set to the start of the list. + * Return data just in case anyone wants it. + **********************************************************************/ + T *move_to_first() { + current = list->First(); + prev = list->last; + next = current != nullptr ? current->next : nullptr; + return current != nullptr ? 
current->data : nullptr; + } // go to start of list + + /*********************************************************************** + * Iterator::move_to_last() + * + * Move current so that it is set to the end of the list. + * Return data just in case anyone wants it. + * (This function can't be INLINEd because it contains a loop) + **********************************************************************/ + T *move_to_last() { + while (current != list->last) { + forward(); } - if (ex_current_was_cycle_pt) { - cycle_pt = new_element; + + if (current == nullptr) { + return nullptr; + } else { + return current->data; } } - } - current = new_element; -} -/*********************************************************************** - * CLIST_ITERATOR::add_after_stay_put - * - * Add a new element to the list after the current element but do not move - * the iterator to the new element. - **********************************************************************/ - -inline void CLIST_ITERATOR::add_after_stay_put( // element to add - void *new_data) { + /*********************************************************************** + * Iterator::mark_cycle_pt() + * + * Remember the current location so that we can tell whether we've returned + * to this point later. + * + * If the current point is deleted either now, or in the future, the cycle + * point will be set to the next item which is set to current. This could be + * by a forward, add_after_then_move or add_after_then_move. 
+ **********************************************************************/ + void mark_cycle_pt() { #ifndef NDEBUG - if (!new_data) { - BAD_PARAMETER.abort("CLIST_ITERATOR::add_after_stay_put", "new_data is nullptr"); - } + if (!list) { + NO_LIST.error("Iterator::mark_cycle_pt", ABORT); + } #endif - auto new_element = new CLIST_LINK; - new_element->data = new_data; - - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - ex_current_was_last = false; - current = nullptr; - } else { - new_element->next = next; - - if (current) { // not extracted - current->next = new_element; - if (prev == current) { - prev = new_element; - } - if (current == list->last) { - list->last = new_element; - } - } else { // current extracted - prev->next = new_element; - if (ex_current_was_last) { - list->last = new_element; - ex_current_was_last = false; + if (current) { + cycle_pt = current; + } else { + ex_current_was_cycle_pt = true; } + started_cycling = false; + } // remember current + + bool empty() const { // is list empty? + return list->empty(); } - next = new_element; - } -} -/*********************************************************************** - * CLIST_ITERATOR::add_before_then_move - * - * Add a new element to the list before the current element and move the - * iterator to the new element. - **********************************************************************/ + bool current_extracted() const { // current extracted? + return !current; + } + + /*********************************************************************** + * Iterator::at_first() + * + * Are we at the start of the list? + * + **********************************************************************/ + bool at_first() const { + // we're at a deleted + return ((list->empty()) || (current == list->First()) || + ((current == nullptr) && (prev == list->last) && // NON-last pt between + !ex_current_was_last)); // first and last + } // Current is first? 
+ + /*********************************************************************** + * Iterator::at_last() + * + * Are we at the end of the list? + * + **********************************************************************/ + bool at_last() const { + // we're at a deleted + return ((list->empty()) || (current == list->last) || + ((current == nullptr) && (prev == list->last) && // last point between + ex_current_was_last)); // first and last + } // Current is last? + + /*********************************************************************** + * Iterator::cycled_list() + * + * Have we returned to the cycle_pt since it was set? + * + **********************************************************************/ + bool cycled_list() const { // Completed a cycle? + return ((list->empty()) || ((current == cycle_pt) && started_cycling)); + } -inline void CLIST_ITERATOR::add_before_then_move( // element to add - void *new_data) { + /*********************************************************************** + * Iterator::add_to_end + * + * Add a new element to the end of the list without moving the iterator. + * This is provided because a single linked list cannot move to the last as + * the iterator couldn't set its prev pointer. Adding to the end is + * essential for implementing + queues. 
+ **********************************************************************/ + void add_to_end( // element to add + T *new_data) { #ifndef NDEBUG - if (!new_data) { - BAD_PARAMETER.abort("CLIST_ITERATOR::add_before_then_move", "new_data is nullptr"); - } + if (!list) { + NO_LIST.error("Iterator::add_to_end", ABORT); + } + if (!new_data) { + BAD_PARAMETER.error("Iterator::add_to_end", ABORT, "new_data is nullptr"); + } #endif - auto new_element = new CLIST_LINK; - new_element->data = new_data; - - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - } else { - prev->next = new_element; - if (current) { // not extracted - new_element->next = current; - next = current; - } else { // current extracted - new_element->next = next; - if (ex_current_was_last) { - list->last = new_element; - } - if (ex_current_was_cycle_pt) { - cycle_pt = new_element; + if (this->at_last()) { + this->add_after_stay_put(new_data); + } else { + if (this->at_first()) { + this->add_before_stay_put(new_data); + list->last = prev; + } else { // Iteratr is elsewhere + auto new_element = new Link; + new_element->data = new_data; + + new_element->next = list->last->next; + list->last->next = new_element; + list->last = new_element; + } } } - } - current = new_element; -} -/*********************************************************************** - * CLIST_ITERATOR::add_before_stay_put - * - * Add a new element to the list before the current element but don't move the - * iterator to the new element. - **********************************************************************/ + /*********************************************************************** + * Iterator::exchange() + * + * Given another iterator, whose current element is a different element on + * the same list list OR an element of another list, exchange the two current + * elements. On return, each iterator points to the element which was the + * other iterators current on entry. 
+ * (This function hasn't been in-lined because its a bit big!) + **********************************************************************/ + void exchange( // positions of 2 links + Iterator *other_it) { // other iterator + constexpr ERRCODE DONT_EXCHANGE_DELETED("Can't exchange deleted elements of lists"); + + /* Do nothing if either list is empty or if both iterators reference the same + link */ + + if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) { + return; + } -inline void CLIST_ITERATOR::add_before_stay_put( // element to add - void *new_data) { -#ifndef NDEBUG - if (!new_data) { - BAD_PARAMETER.abort("CLIST_ITERATOR::add_before_stay_put", "new_data is nullptr"); - } -#endif + /* Error if either current element is deleted */ - auto new_element = new CLIST_LINK; - new_element->data = new_data; - - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - ex_current_was_last = true; - current = nullptr; - } else { - prev->next = new_element; - if (current) { // not extracted - new_element->next = current; - if (next == current) { - next = new_element; - } - } else { // current extracted - new_element->next = next; - if (ex_current_was_last) { - list->last = new_element; + if (!current || !other_it->current) { + DONT_EXCHANGE_DELETED.error("Iterator.exchange", ABORT); } - } - prev = new_element; - } -} - -/*********************************************************************** - * CLIST_ITERATOR::add_list_after - * - * Insert another list to this list after the current element but don't move - *the - * iterator. 
- **********************************************************************/ -inline void CLIST_ITERATOR::add_list_after(CLIST *list_to_add) { - if (!list_to_add->empty()) { - if (list->empty()) { - list->last = list_to_add->last; - prev = list->last; - next = list->First(); - ex_current_was_last = true; - current = nullptr; - } else { - if (current) { // not extracted - current->next = list_to_add->First(); - if (current == list->last) { - list->last = list_to_add->last; + /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements + (other before this); non-doubleton adjacent elements (this before other); + non-adjacent elements. */ + + // adjacent links + if ((next == other_it->current) || (other_it->next == current)) { + // doubleton list + if ((next == other_it->current) && (other_it->next == current)) { + prev = next = current; + other_it->prev = other_it->next = other_it->current; + } else { // non-doubleton with + // adjacent links + // other before this + if (other_it->next == current) { + other_it->prev->next = current; + other_it->current->next = next; + current->next = other_it->current; + other_it->next = other_it->current; + prev = current; + } else { // this before other + prev->next = other_it->current; + current->next = other_it->next; + other_it->current->next = current; + next = current; + other_it->prev = other_it->current; + } } - list_to_add->last->next = next; - next = current->next; - } else { // current extracted - prev->next = list_to_add->First(); - if (ex_current_was_last) { - list->last = list_to_add->last; - ex_current_was_last = false; - } - list_to_add->last->next = next; - next = prev->next; + } else { // no overlap + prev->next = other_it->current; + current->next = other_it->next; + other_it->prev->next = current; + other_it->current->next = next; } - } - list_to_add->last = nullptr; - } -} -/*********************************************************************** - * CLIST_ITERATOR::add_list_before - * - * Insert 
another list to this list before the current element. Move the - * iterator to the start of the inserted elements - * iterator. - **********************************************************************/ + /* update end of list pointer when necessary (remember that the 2 iterators + may iterate over different lists!) */ -inline void CLIST_ITERATOR::add_list_before(CLIST *list_to_add) { - if (!list_to_add->empty()) { - if (list->empty()) { - list->last = list_to_add->last; - prev = list->last; - current = list->First(); - next = current->next; - ex_current_was_last = false; - } else { - prev->next = list_to_add->First(); - if (current) { // not extracted - list_to_add->last->next = current; - } else { // current extracted - list_to_add->last->next = next; - if (ex_current_was_last) { - list->last = list_to_add->last; - } - if (ex_current_was_cycle_pt) { - cycle_pt = prev->next; - } + if (list->last == current) { + list->last = other_it->current; + } + if (other_it->list->last == other_it->current) { + other_it->list->last = current; } - current = prev->next; - next = current->next; - } - list_to_add->last = nullptr; - } -} -/*********************************************************************** - * CLIST_ITERATOR::extract - * - * Do extraction by removing current from the list, deleting the cons cell - * and returning the data to the caller, but NOT updating the iterator. (So - * that any calling loop can do this.) The iterator's current points to - * nullptr. If the data is to be deleted, this is the callers responsibility. 
- **********************************************************************/ + if (current == cycle_pt) { + cycle_pt = other_it->cycle_pt; + } + if (other_it->current == other_it->cycle_pt) { + other_it->cycle_pt = cycle_pt; + } -inline void *CLIST_ITERATOR::extract() { -#ifndef NDEBUG - if (!current) { // list empty or - // element extracted - NULL_CURRENT.abort("CLIST_ITERATOR::extract"); - } -#endif + /* The actual exchange - in all cases*/ - if (list->singleton()) { - // Special case where we do need to change the iterator. - prev = next = list->last = nullptr; - } else { - prev->next = next; // remove from list + auto old_current = current; + current = other_it->current; + other_it->current = old_current; + } - if (current == list->last) { - list->last = prev; - ex_current_was_last = true; - } else { - ex_current_was_last = false; + /*********************************************************************** + * Iterator::length() + * + * Return the length of the list + * + **********************************************************************/ + int32_t length() const { + return list->length(); } - } - // Always set ex_current_was_cycle_pt so an add/forward will work in a loop. - ex_current_was_cycle_pt = (current == cycle_pt); - auto extracted_data = current->data; - delete (current); // destroy CONS cell - current = nullptr; - return extracted_data; -} - -/*********************************************************************** - * CLIST_ITERATOR::move_to_first() - * - * Move current so that it is set to the start of the list. - * Return data just in case anyone wants it. - **********************************************************************/ -inline void *CLIST_ITERATOR::move_to_first() { - current = list->First(); - prev = list->last; - next = current != nullptr ? current->next : nullptr; - return current != nullptr ? 
current->data : nullptr; -} + /*********************************************************************** + * Iterator::sort() + * + * Sort the elements of the list, then reposition at the start. + * + **********************************************************************/ + void sort( // sort elements + int comparator( // comparison routine + const T *, const T *)) { + list->sort(comparator); + move_to_first(); + } + }; + using ITERATOR = Iterator; // compat -/*********************************************************************** - * CLIST_ITERATOR::mark_cycle_pt() - * - * Remember the current location so that we can tell whether we've returned - * to this point later. - * - * If the current point is deleted either now, or in the future, the cycle - * point will be set to the next item which is set to current. This could be - * by a forward, add_after_then_move or add_after_then_move. - **********************************************************************/ +private: + Link *last = nullptr; // End of list -inline void CLIST_ITERATOR::mark_cycle_pt() { -#ifndef NDEBUG - if (!list) { - NO_LIST.abort("CLIST_ITERATOR::mark_cycle_pt"); + //(Points to head) + Link *First() { // return first + return last != nullptr ? last->next : nullptr; } -#endif - if (current) { - cycle_pt = current; - } else { - ex_current_was_cycle_pt = true; + const Link *First() const { // return first + return last != nullptr ? last->next : nullptr; } - started_cycling = false; -} -/*********************************************************************** - * CLIST_ITERATOR::at_first() - * - * Are we at the start of the list? 
- * - **********************************************************************/ +public: + ~ConsList() { // destructor + shallow_clear(); + } -inline bool CLIST_ITERATOR::at_first() const { - // we're at a deleted - return ((list->empty()) || (current == list->First()) || - ((current == nullptr) && (prev == list->last) && // NON-last pt between - !ex_current_was_last)); // first and last -} + /*********************************************************************** + * ConsList::internal_deep_clear + * + * Used by the "deep_clear" member function of derived list + * classes to destroy all the elements on the list. + * The calling function passes a "zapper" function which can be called to + * delete each data element of the list, regardless of its class. This + * technique permits a generic clear function to destroy elements of + * different derived types correctly, without requiring virtual functions and + * the consequential memory overhead. + **********************************************************************/ + void internal_deep_clear() { // ptr to zapper functn + if (!empty()) { + auto ptr = last->next; // set to first + last->next = nullptr; // break circle + last = nullptr; // set list empty + while (ptr) { + auto next = ptr->next; + delete ptr->data; + delete (ptr); + ptr = next; + } + } + } + void deep_clear() { + internal_deep_clear(); + } -/*********************************************************************** - * CLIST_ITERATOR::at_last() - * - * Are we at the end of the list? - * - **********************************************************************/ + /*********************************************************************** + * ConsList::shallow_clear + * + * Used by the destructor and the "shallow_clear" member function of derived + * list classes to destroy the list. + * The data elements are NOT destroyed. 
+ * + **********************************************************************/ + void shallow_clear() { // destroy all links + if (!empty()) { + auto ptr = last->next; // set to first + last->next = nullptr; // break circle + last = nullptr; // set list empty + while (ptr) { + auto next = ptr->next; + delete (ptr); + ptr = next; + } + } + } -inline bool CLIST_ITERATOR::at_last() const { - // we're at a deleted - return ((list->empty()) || (current == list->last) || - ((current == nullptr) && (prev == list->last) && // last point between - ex_current_was_last)); // first and last -} + bool empty() const { // is list empty? + return !last; + } -/*********************************************************************** - * CLIST_ITERATOR::cycled_list() - * - * Have we returned to the cycle_pt since it was set? - * - **********************************************************************/ + bool singleton() const { + return last != nullptr ? (last == last->next) : false; + } -inline bool CLIST_ITERATOR::cycled_list() const { - return ((list->empty()) || ((current == cycle_pt) && started_cycling)); -} + void shallow_copy( // dangerous!! + ConsList *from_list) { // beware destructors!! + last = from_list->last; + } -/*********************************************************************** - * CLIST_ITERATOR::length() - * - * Return the length of the list - * - **********************************************************************/ + /*********************************************************************** + * ConsList::assign_to_sublist + * + * The list is set to a sublist of another list. "This" list must be empty + * before this function is invoked. The two iterators passed must refer to + * the same list, different from "this" one. The sublist removed is the + * inclusive list from start_it's current position to end_it's current + * position. If this range passes over the end of the source list then the + * source list has its end set to the previous element of start_it. 
The + * extracted sublist is unaffected by the end point of the source list, its + * end point is always the end_it position. + **********************************************************************/ + void assign_to_sublist( // to this list + Iterator *start_it, // from list start + Iterator *end_it) { // from list end + constexpr ERRCODE LIST_NOT_EMPTY("Destination list must be empty before extracting a sublist"); + + if (!empty()) { + LIST_NOT_EMPTY.error("ConsList.assign_to_sublist", ABORT); + } -inline int32_t CLIST_ITERATOR::length() const { - return list->length(); -} + last = start_it->extract_sublist(end_it); + } -/*********************************************************************** - * CLIST_ITERATOR::sort() - * - * Sort the elements of the list, then reposition at the start. - * - **********************************************************************/ + int32_t length() const { //# elements in list + int32_t count = 0; + if (last != nullptr) { + count = 1; + for (auto it = last->next; it != last; it = it->next) { + count++; + } + } + return count; + } -inline void CLIST_ITERATOR::sort( // sort elements - int comparator( // comparison routine - const void *, const void *)) { - list->sort(comparator); - move_to_first(); -} + /*********************************************************************** + * ConsList::sort + * + * Sort elements on list + **********************************************************************/ + void sort( // sort elements + int comparator( // comparison routine + const T *, const T *)) { + // Allocate an array of pointers, one per list element. + auto count = length(); + if (count > 0) { + // ptr array to sort + std::vector base; + base.reserve(count); + + Iterator it(this); + + // Extract all elements, putting the pointers in the array. 
+ for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + base.push_back(it.extract()); + } -/*********************************************************************** - * CLIST_ITERATOR::add_to_end - * - * Add a new element to the end of the list without moving the iterator. - * This is provided because a single linked list cannot move to the last as - * the iterator couldn't set its prev pointer. Adding to the end is - * essential for implementing - queues. -**********************************************************************/ - -inline void CLIST_ITERATOR::add_to_end( // element to add - void *new_data) { -#ifndef NDEBUG - if (!list) { - NO_LIST.abort("CLIST_ITERATOR::add_to_end"); - } - if (!new_data) { - BAD_PARAMETER.abort("CLIST_ITERATOR::add_to_end", "new_data is nullptr"); + // Sort the pointer array. + std::sort(base.begin(), base.end(), + // all current comparators return -1,0,1, so we handle this correctly for std::sort + [&](auto &&l, auto &&r) {return comparator(l, r) < 0; }); + + // Rebuild the list from the sorted pointers. + for (auto current : base) { + it.add_to_end(current); + } + } } -#endif - if (this->at_last()) { - this->add_after_stay_put(new_data); - } else { - if (this->at_first()) { - this->add_before_stay_put(new_data); - list->last = prev; - } else { // Iteratr is elsewhere - auto new_element = new CLIST_LINK; + // Assuming list has been sorted already, insert new_data to + // keep the list sorted according to the same comparison function. + // Comparison function is the same as used by sort, i.e. uses double + // indirection. Time is O(1) to add to beginning or end. + // Time is linear to add pre-sorted items to an empty list. + // If unique, then don't add duplicate entries. + // Returns true if the element was added to the list. + bool add_sorted(int comparator(const T *, const T *), bool unique, T *new_data) { + // Check for adding at the end. 
+ if (last == nullptr || comparator(last->data, new_data) < 0) { + auto *new_element = new Link; new_element->data = new_data; - - new_element->next = list->last->next; - list->last->next = new_element; - list->last = new_element; + if (last == nullptr) { + new_element->next = new_element; + } else { + new_element->next = last->next; + last->next = new_element; + } + last = new_element; + return true; + } else if (!unique || last->data != new_data) { + // Need to use an iterator. + Iterator it(this); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + auto data = it.data(); + if (data == new_data && unique) { + return false; + } + if (comparator(data, new_data) > 0) { + break; + } + } + if (it.cycled_list()) { + it.add_to_end(new_data); + } else { + it.add_before_then_move(new_data); + } + return true; } + return false; } -} - -template -class X_CLIST : public CLIST { -public: - X_CLIST() = default; - X_CLIST(const X_CLIST &) = delete; - X_CLIST &operator=(const X_CLIST &) = delete; - void deep_clear() { - internal_deep_clear([](void *link) {delete static_cast(link);}); + // Assuming that the minuend and subtrahend are already sorted with + // the same comparison function, shallow clears this and then copies + // the set difference minuend - subtrahend to this, being the elements + // of minuend that do not compare equal to anything in subtrahend. + // If unique is true, any duplicates in minuend are also eliminated. + void set_subtract(int comparator(const T *, const T *), bool unique, ConsList *minuend, + ConsList *subtrahend) { + shallow_clear(); + Iterator m_it(minuend); + Iterator s_it(subtrahend); + // Since both lists are sorted, finding the subtras that are not + // minus is a case of a parallel iteration. 
+ for (m_it.mark_cycle_pt(); !m_it.cycled_list(); m_it.forward()) { + auto minu = m_it.data(); + T *subtra = nullptr; + if (!s_it.empty()) { + subtra = s_it.data(); + while (!s_it.at_last() && comparator(subtra, minu) < 0) { + s_it.forward(); + subtra = s_it.data(); + } + } + if (subtra == nullptr || comparator(subtra, minu) != 0) { + add_sorted(comparator, unique, minu); + } + } } }; -#define CLISTIZEH(CLASSNAME) \ +#define CLISTIZEH(T) \ class CLASSNAME##_CLIST; \ struct CLASSNAME##_C_IT; \ \ - class CLASSNAME##_CLIST : public X_CLIST { \ - using X_CLIST::X_CLIST; \ - }; \ - struct CLASSNAME##_C_IT : X_ITER { \ - using X_ITER::X_ITER; \ - } + class T##_CLIST : public ConsList { \ + using ConsList::ConsList; \ + }; \ + using T##_C_IT = ConsList::Iterator } // namespace tesseract diff --git a/src/ccutil/elst.cpp b/src/ccutil/elst.cpp deleted file mode 100644 index 068327f581..0000000000 --- a/src/ccutil/elst.cpp +++ /dev/null @@ -1,446 +0,0 @@ -/********************************************************************** - * File: elst.cpp (Formerly elist.c) - * Description: Embedded list handling code which is not in the include file. - * Author: Phil Cheatle - * - * (C) Copyright 1991, Hewlett-Packard Ltd. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include // compiler config, etc. 
- -#include "elst.h" -#include - -namespace tesseract { - -/*********************************************************************** - * ELIST::internal_clear - * - * Used by the destructor and the "clear" member function of derived list - * classes to destroy all the elements on the list. - * The calling function passes a "zapper" function which can be called to - * delete each element of the list, regardless of its derived type. This - * technique permits a generic clear function to destroy elements of - * different derived types correctly, without requiring virtual functions and - * the consequential memory overhead. - **********************************************************************/ - -void ELIST::internal_clear( // destroy all links - void (*zapper)(void *)) { - // ptr to zapper functn - ELIST_LINK *ptr; - ELIST_LINK *next; - - if (!empty()) { - ptr = last->next; // set to first - last->next = nullptr; // break circle - last = nullptr; // set list empty - while (ptr) { - next = ptr->next; - zapper(ptr); - ptr = next; - } - } -} - -/*********************************************************************** - * ELIST::assign_to_sublist - * - * The list is set to a sublist of another list. "This" list must be empty - * before this function is invoked. The two iterators passed must refer to - * the same list, different from "this" one. The sublist removed is the - * inclusive list from start_it's current position to end_it's current - * position. If this range passes over the end of the source list then the - * source list has its end set to the previous element of start_it. The - * extracted sublist is unaffected by the end point of the source list, its - * end point is always the end_it position. 
- **********************************************************************/ - -void ELIST::assign_to_sublist( // to this list - ELIST_ITERATOR *start_it, // from list start - ELIST_ITERATOR *end_it) { // from list end - constexpr ERRCODE LIST_NOT_EMPTY("Destination list must be empty before extracting a sublist"); - - if (!empty()) { - LIST_NOT_EMPTY.abort("ELIST.assign_to_sublist"); - } - - last = start_it->extract_sublist(end_it); -} - -/*********************************************************************** - * ELIST::sort - * - * Sort elements on list - * NB If you don't like the const declarations in the comparator, coerce yours: - * ( int (*)(const void *, const void *) - **********************************************************************/ - -void ELIST::sort( // sort elements - int comparator( // comparison routine - const void *, const void *)) { - // Allocate an array of pointers, one per list element. - auto count = length(); - - if (count > 0) { - // ptr array to sort - std::vector base; - base.reserve(count); - - ELIST_ITERATOR it(this); - - // Extract all elements, putting the pointers in the array. - for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - base.push_back(it.extract()); - } - - // Sort the pointer array. - qsort(&base[0], count, sizeof(base[0]), comparator); - - // Rebuild the list from the sorted pointers. - for (auto current : base) { - it.add_to_end(current); - } - } -} - -// Assuming list has been sorted already, insert new_link to -// keep the list sorted according to the same comparison function. -// Comparison function is the same as used by sort, i.e. uses double -// indirection. Time is O(1) to add to beginning or end. -// Time is linear to add pre-sorted items to an empty list. 
-// If unique is set to true and comparator() returns 0 (an entry with the -// same information as the one contained in new_link is already in the -// list) - new_link is not added to the list and the function returns the -// pointer to the identical entry that already exists in the list -// (otherwise the function returns new_link). -ELIST_LINK *ELIST::add_sorted_and_find(int comparator(const void *, const void *), bool unique, - ELIST_LINK *new_link) { - // Check for adding at the end. - if (last == nullptr || comparator(&last, &new_link) < 0) { - if (last == nullptr) { - new_link->next = new_link; - } else { - new_link->next = last->next; - last->next = new_link; - } - last = new_link; - } else { - // Need to use an iterator. - ELIST_ITERATOR it(this); - for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - ELIST_LINK *link = it.data(); - int compare = comparator(&link, &new_link); - if (compare > 0) { - break; - } else if (unique && compare == 0) { - return link; - } - } - if (it.cycled_list()) { - it.add_to_end(new_link); - } else { - it.add_before_then_move(new_link); - } - } - return new_link; -} - -/*********************************************************************** - * MEMBER FUNCTIONS OF CLASS: ELIST_ITERATOR - * ========================================= - **********************************************************************/ - -/*********************************************************************** - * ELIST_ITERATOR::forward - * - * Move the iterator to the next element of the list. - * REMEMBER: ALL LISTS ARE CIRCULAR. - **********************************************************************/ - -ELIST_LINK *ELIST_ITERATOR::forward() { -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST_ITERATOR::forward"); -#endif - if (list->empty()) { - return nullptr; - } - - if (current) { // not removed so - // set previous - prev = current; - started_cycling = true; - // In case next is deleted by another iterator, get next from current. 
- current = current->next; - } else { - if (ex_current_was_cycle_pt) { - cycle_pt = next; - } - current = next; - } -#ifndef NDEBUG - if (!current) - NULL_DATA.abort("ELIST_ITERATOR::forward"); -#endif - next = current->next; - -#ifndef NDEBUG - if (!next) { - NULL_NEXT.abort("ELIST_ITERATOR::forward", - "This is: %p Current is: %p", - static_cast(this), - static_cast(current)); - } -#endif - return current; -} - -/*********************************************************************** - * ELIST_ITERATOR::data_relative - * - * Return the data pointer to the element "offset" elements from current. - * "offset" must not be less than -1. - * (This function can't be INLINEd because it contains a loop) - **********************************************************************/ - -ELIST_LINK *ELIST_ITERATOR::data_relative( // get data + or - ... - int8_t offset) { // offset from current - ELIST_LINK *ptr; - -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST_ITERATOR::data_relative"); - if (list->empty()) - EMPTY_LIST.abort("ELIST_ITERATOR::data_relative"); - if (offset < -1) - BAD_PARAMETER.abort("ELIST_ITERATOR::data_relative", "offset < -l"); -#endif - - if (offset == -1) { - ptr = prev; - } else { - for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) { -#ifndef NDEBUG - if (!ptr) - BAD_PARAMETER.error("CLIST_ITERATOR::data_relative", ABORT, "ptr == nullptr"); -#endif - ; - } - } - -#ifndef NDEBUG - if (!ptr) - NULL_DATA.abort("ELIST_ITERATOR::data_relative"); -#endif - - return ptr; -} - -/*********************************************************************** - * ELIST_ITERATOR::move_to_last() - * - * Move current so that it is set to the end of the list. - * Return data just in case anyone wants it. 
- * (This function can't be INLINEd because it contains a loop) - **********************************************************************/ - -ELIST_LINK *ELIST_ITERATOR::move_to_last() { -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST_ITERATOR::move_to_last"); -#endif - - while (current != list->last) { - forward(); - } - - return current; -} - -/*********************************************************************** - * ELIST_ITERATOR::exchange() - * - * Given another iterator, whose current element is a different element on - * the same list list OR an element of another list, exchange the two current - * elements. On return, each iterator points to the element which was the - * other iterators current on entry. - * (This function hasn't been in-lined because its a bit big!) - **********************************************************************/ - -void ELIST_ITERATOR::exchange( // positions of 2 links - ELIST_ITERATOR *other_it) { // other iterator - constexpr ERRCODE DONT_EXCHANGE_DELETED("Can't exchange deleted elements of lists"); - - ELIST_LINK *old_current; - -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST_ITERATOR::exchange"); - if (!other_it) - BAD_PARAMETER.abort("ELIST_ITERATOR::exchange", "other_it nullptr"); - if (!(other_it->list)) - NO_LIST.abort("ELIST_ITERATOR::exchange", "other_it"); -#endif - - /* Do nothing if either list is empty or if both iterators reference the same -link */ - - if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) { - return; - } - - /* Error if either current element is deleted */ - - if (!current || !other_it->current) { - DONT_EXCHANGE_DELETED.abort("ELIST_ITERATOR.exchange"); - } - - /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements -(other before this); non-doubleton adjacent elements (this before other); -non-adjacent elements. 
*/ - - // adjacent links - if ((next == other_it->current) || (other_it->next == current)) { - // doubleton list - if ((next == other_it->current) && (other_it->next == current)) { - prev = next = current; - other_it->prev = other_it->next = other_it->current; - } else { // non-doubleton with - // adjacent links - // other before this - if (other_it->next == current) { - other_it->prev->next = current; - other_it->current->next = next; - current->next = other_it->current; - other_it->next = other_it->current; - prev = current; - } else { // this before other - prev->next = other_it->current; - current->next = other_it->next; - other_it->current->next = current; - next = current; - other_it->prev = other_it->current; - } - } - } else { // no overlap - prev->next = other_it->current; - current->next = other_it->next; - other_it->prev->next = current; - other_it->current->next = next; - } - - /* update end of list pointer when necessary (remember that the 2 iterators - may iterate over different lists!) */ - - if (list->last == current) { - list->last = other_it->current; - } - if (other_it->list->last == other_it->current) { - other_it->list->last = current; - } - - if (current == cycle_pt) { - cycle_pt = other_it->cycle_pt; - } - if (other_it->current == other_it->cycle_pt) { - other_it->cycle_pt = cycle_pt; - } - - /* The actual exchange - in all cases*/ - - old_current = current; - current = other_it->current; - other_it->current = old_current; -} - -/*********************************************************************** - * ELIST_ITERATOR::extract_sublist() - * - * This is a private member, used only by ELIST::assign_to_sublist. - * Given another iterator for the same list, extract the links from THIS to - * OTHER inclusive, link them into a new circular list, and return a - * pointer to the last element. 
- * (Can't inline this function because it contains a loop) - **********************************************************************/ - -ELIST_LINK *ELIST_ITERATOR::extract_sublist( // from this current - ELIST_ITERATOR *other_it) { // to other current -#ifndef NDEBUG - constexpr ERRCODE BAD_EXTRACTION_PTS("Can't extract sublist from points on different lists"); - constexpr ERRCODE DONT_EXTRACT_DELETED("Can't extract a sublist marked by deleted points"); -#endif - constexpr ERRCODE BAD_SUBLIST("Can't find sublist end point in original list"); - - ELIST_ITERATOR temp_it = *this; - ELIST_LINK *end_of_new_list; - -#ifndef NDEBUG - if (!other_it) - BAD_PARAMETER.abort("ELIST_ITERATOR::extract_sublist", "other_it nullptr"); - if (!list) - NO_LIST.abort("ELIST_ITERATOR::extract_sublist"); - if (list != other_it->list) - BAD_EXTRACTION_PTS.abort("ELIST_ITERATOR.extract_sublist"); - if (list->empty()) - EMPTY_LIST.abort("ELIST_ITERATOR::extract_sublist"); - - if (!current || !other_it->current) - DONT_EXTRACT_DELETED.abort("ELIST_ITERATOR.extract_sublist"); -#endif - - ex_current_was_last = other_it->ex_current_was_last = false; - ex_current_was_cycle_pt = false; - other_it->ex_current_was_cycle_pt = false; - - temp_it.mark_cycle_pt(); - do { // walk sublist - if (temp_it.cycled_list()) { // can't find end pt - BAD_SUBLIST.abort("ELIST_ITERATOR.extract_sublist"); - } - - if (temp_it.at_last()) { - list->last = prev; - ex_current_was_last = other_it->ex_current_was_last = true; - } - - if (temp_it.current == cycle_pt) { - ex_current_was_cycle_pt = true; - } - - if (temp_it.current == other_it->cycle_pt) { - other_it->ex_current_was_cycle_pt = true; - } - - temp_it.forward(); - } while (temp_it.prev != other_it->current); - - // circularise sublist - other_it->current->next = current; - end_of_new_list = other_it->current; - - // sublist = whole list - if (prev == other_it->current) { - list->last = nullptr; - prev = current = next = nullptr; - other_it->prev = 
other_it->current = other_it->next = nullptr; - } else { - prev->next = other_it->next; - current = other_it->current = nullptr; - next = other_it->next; - other_it->prev = prev; - } - return end_of_new_list; -} - -} // namespace tesseract diff --git a/src/ccutil/elst.h b/src/ccutil/elst.h index dc21e76fb3..c7a8e1fc1a 100644 --- a/src/ccutil/elst.h +++ b/src/ccutil/elst.h @@ -19,16 +19,14 @@ #ifndef ELST_H #define ELST_H -#include "list.h" #include "lsterr.h" #include "serialis.h" +#include #include namespace tesseract { -class ELIST_ITERATOR; - /********************************************************************** This module implements list classes and iterators. The following list types and iterators are provided: @@ -68,755 +66,1073 @@ list class - though macros can generate these. It also prevents heterogeneous lists. **********************************************************************/ -/********************************************************************** - * CLASS - ELIST_LINK - * - * Generic link class for singly linked lists with - *embedded links - * - * Note: No destructor - elements are assumed to be destroyed EITHER after - * they have been extracted from a list OR by the ELIST destructor which - * walks the list. - **********************************************************************/ - -class ELIST_LINK { - friend class ELIST_ITERATOR; - friend class ELIST; - - ELIST_LINK *next; - -public: - ELIST_LINK() { - next = nullptr; - } - // constructor - - // The special copy constructor is used by lots of classes. - ELIST_LINK(const ELIST_LINK &) { - next = nullptr; - } - - // The special assignment operator is used by lots of classes. 
- void operator=(const ELIST_LINK &) { - next = nullptr; - } -}; - /********************************************************************** * CLASS - ELIST * * Generic list class for singly linked lists with embedded links **********************************************************************/ -class TESS_API ELIST { - friend class ELIST_ITERATOR; - - ELIST_LINK *last = nullptr; // End of list - - //(Points to head) - ELIST_LINK *First() { // return first - return last ? last->next : nullptr; - } - +template +class IntrusiveForwardList { public: - // destroy all links - void internal_clear(void (*zapper)(void *)); - - bool empty() const { - return !last; - } - - bool singleton() const { - return last ? (last == last->next) : false; - } - - void shallow_copy( // dangerous!! - ELIST *from_list) { // beware destructors!! - last = from_list->last; - } - - // ptr to copier functn - void internal_deep_copy(ELIST_LINK *(*copier)(ELIST_LINK *), - const ELIST *list); // list being copied - - void assign_to_sublist( // to this list - ELIST_ITERATOR *start_it, // from list start - ELIST_ITERATOR *end_it); // from list end - - // # elements in list - int32_t length() const { - int32_t count = 0; - if (last != nullptr) { - count = 1; - for (auto it = last->next; it != last; it = it->next) { - count++; - } + /********************************************************************** + * CLASS - ELIST_LINK + * + * Generic link class for singly linked lists with + *embedded links + * + * Note: No destructor - elements are assumed to be destroyed EITHER after + * they have been extracted from a list OR by the IntrusiveForwardList destructor which + * walks the list. 
+ **********************************************************************/ + + class Link { + friend class Iterator; + friend class IntrusiveForwardList; + + T *next; + + public: + Link() { + next = nullptr; } - return count; - } + // constructor - void sort( // sort elements - int comparator( // comparison routine - const void *, const void *)); + // The special copy constructor is used by lots of classes. + Link(const Link &) { + next = nullptr; + } - // Assuming list has been sorted already, insert new_link to - // keep the list sorted according to the same comparison function. - // Comparison function is the same as used by sort, i.e. uses double - // indirection. Time is O(1) to add to beginning or end. - // Time is linear to add pre-sorted items to an empty list. - // If unique is set to true and comparator() returns 0 (an entry with the - // same information as the one contained in new_link is already in the - // list) - new_link is not added to the list and the function returns the - // pointer to the identical entry that already exists in the list - // (otherwise the function returns new_link). - ELIST_LINK *add_sorted_and_find(int comparator(const void *, const void *), bool unique, - ELIST_LINK *new_link); + // The special assignment operator is used by lots of classes. + void operator=(const Link &) { + next = nullptr; + } + }; + using LINK = Link; // compat - // Same as above, but returns true if the new entry was inserted, false - // if the identical entry already existed in the list. 
- bool add_sorted(int comparator(const void *, const void *), bool unique, ELIST_LINK *new_link) { - return (add_sorted_and_find(comparator, unique, new_link) == new_link); - } -}; -/*********************************************************************** - * CLASS - ELIST_ITERATOR - * - * Generic iterator class for singly linked lists with - *embedded links - **********************************************************************/ + /*********************************************************************** + * CLASS - ELIST_ITERATOR + * + * Generic iterator class for singly linked lists with + *embedded links + **********************************************************************/ -class TESS_API ELIST_ITERATOR { - friend void ELIST::assign_to_sublist(ELIST_ITERATOR *, ELIST_ITERATOR *); + class Iterator { + friend void IntrusiveForwardList::assign_to_sublist(Iterator *, Iterator *); // V730 Not all members of a class are initialized inside the constructor. Consider inspecting: prev, current, next, cycle_pt, ex_current_was_last, ex_current_was_cycle_pt, ... elst.h 204 - ELIST *list = nullptr; // List being iterated - ELIST_LINK *prev = nullptr; // prev element - ELIST_LINK *current = nullptr; // current element - ELIST_LINK *next = nullptr; // next element - ELIST_LINK *cycle_pt = nullptr; // point we are cycling the list to. + IntrusiveForwardList *list = nullptr; // List being iterated + T *prev = nullptr; // prev element + T *current = nullptr; // current element + T *next = nullptr; // next element + T *cycle_pt = nullptr; // point we are cycling the list to. bool ex_current_was_last = false; // current extracted was end of list bool ex_current_was_cycle_pt = false; // current extracted was cycle point bool started_cycling = false; // Have we moved off the start? - - ELIST_LINK *extract_sublist( // from this current... 
- ELIST_ITERATOR *other_it); // to other current - -public: - ELIST_ITERATOR() { // constructor - list = nullptr; - } // unassigned list - - explicit ELIST_ITERATOR(ELIST *list_to_iterate); - - void set_to_list( // change list - ELIST *list_to_iterate); - - void add_after_then_move( // add after current & - ELIST_LINK *new_link); // move to new - - void add_after_stay_put( // add after current & - ELIST_LINK *new_link); // stay at current - - void add_before_then_move( // add before current & - ELIST_LINK *new_link); // move to new - - void add_before_stay_put( // add before current & - ELIST_LINK *new_link); // stay at current - - void add_list_after( // add a list & - ELIST *list_to_add); // stay at current - - void add_list_before( // add a list & - ELIST *list_to_add); // move to it 1st item - - ELIST_LINK *data() { // get current data + + /*********************************************************************** + * Iterator::extract_sublist() + * + * This is a private member, used only by IntrusiveForwardList::assign_to_sublist. + * Given another iterator for the same list, extract the links from THIS to + * OTHER inclusive, link them into a new circular list, and return a + * pointer to the last element. + * (Can't inline this function because it contains a loop) + **********************************************************************/ + T *extract_sublist( // from this current... + Iterator *other_it) { // to other current #ifndef NDEBUG - if (!list) { - NO_LIST.abort("ELIST_ITERATOR::data"); - } - if (!current) { - NULL_DATA.abort("ELIST_ITERATOR::data"); - } + constexpr ERRCODE BAD_EXTRACTION_PTS("Can't extract sublist from points on different lists"); + constexpr ERRCODE DONT_EXTRACT_DELETED("Can't extract a sublist marked by deleted points"); #endif - return current; - } - - ELIST_LINK *data_relative( // get data + or - ... 
- int8_t offset); // offset from current - - ELIST_LINK *forward(); // move to next element - - ELIST_LINK *extract(); // remove from list + constexpr ERRCODE BAD_SUBLIST("Can't find sublist end point in original list"); - ELIST_LINK *move_to_first(); // go to start of list + Iterator temp_it = *this; + T *end_of_new_list; - ELIST_LINK *move_to_last(); // go to end of list - - void mark_cycle_pt(); // remember current - - bool empty() const { // is list empty? #ifndef NDEBUG - if (!list) { - NO_LIST.abort("ELIST_ITERATOR::empty"); - } + if (!other_it) + BAD_PARAMETER.error("ELIST_ITERATOR::extract_sublist", ABORT, "other_it nullptr"); + if (!list) + NO_LIST.error("ELIST_ITERATOR::extract_sublist", ABORT); + if (list != other_it->list) + BAD_EXTRACTION_PTS.error("ELIST_ITERATOR.extract_sublist", ABORT); + if (list->empty()) + EMPTY_LIST.error("ELIST_ITERATOR::extract_sublist", ABORT); + + if (!current || !other_it->current) + DONT_EXTRACT_DELETED.error("ELIST_ITERATOR.extract_sublist", ABORT); #endif - return list->empty(); - } - - bool current_extracted() const { // current extracted? - return !current; - } - bool at_first() const; // Current is first? + ex_current_was_last = other_it->ex_current_was_last = false; + ex_current_was_cycle_pt = false; + other_it->ex_current_was_cycle_pt = false; - bool at_last() const; // Current is last? - - bool cycled_list() const; // Completed a cycle? 
- - void add_to_end( // add at end & - ELIST_LINK *new_link); // don't move - - void exchange( // positions of 2 links - ELIST_ITERATOR *other_it); // other iterator + temp_it.mark_cycle_pt(); + do { // walk sublist + if (temp_it.cycled_list()) { // can't find end pt + BAD_SUBLIST.error("Iterator.extract_sublist", ABORT); + } - //# elements in list - int32_t length() const { - return list->length(); - } + if (temp_it.at_last()) { + list->last = prev; + ex_current_was_last = other_it->ex_current_was_last = true; + } - void sort( // sort elements - int comparator( // comparison routine - const void *, const void *)); -}; + if (temp_it.current == cycle_pt) { + ex_current_was_cycle_pt = true; + } -/*********************************************************************** - * ELIST_ITERATOR::set_to_list - * - * (Re-)initialise the iterator to point to the start of the list_to_iterate - * over. - **********************************************************************/ + if (temp_it.current == other_it->cycle_pt) { + other_it->ex_current_was_cycle_pt = true; + } -inline void ELIST_ITERATOR::set_to_list( // change list - ELIST *list_to_iterate) { + temp_it.forward(); + } while (temp_it.prev != other_it->current); + + // circularise sublist + other_it->current->next = current; + end_of_new_list = other_it->current; + + // sublist = whole list + if (prev == other_it->current) { + list->last = nullptr; + prev = current = next = nullptr; + other_it->prev = other_it->current = other_it->next = nullptr; + } else { + prev->next = other_it->next; + current = other_it->current = nullptr; + next = other_it->next; + other_it->prev = prev; + } + return end_of_new_list; + } // to other current + + public: + Iterator() { // constructor + list = nullptr; + } // unassigned list + /*********************************************************************** + * ELIST_ITERATOR::ELIST_ITERATOR + * + * CONSTRUCTOR - set iterator to specified list; + 
**********************************************************************/ + Iterator(IntrusiveForwardList *list_to_iterate) { + set_to_list(list_to_iterate); + } + /*********************************************************************** + * ELIST_ITERATOR::set_to_list + * + * (Re-)initialise the iterator to point to the start of the list_to_iterate + * over. + **********************************************************************/ + void set_to_list( // change list + IntrusiveForwardList *list_to_iterate) { #ifndef NDEBUG - if (!list_to_iterate) { + if (!list_to_iterate) { BAD_PARAMETER.abort("ELIST_ITERATOR::set_to_list", "list_to_iterate is nullptr"); - } + } #endif - list = list_to_iterate; - prev = list->last; - current = list->First(); - next = current ? current->next : nullptr; - cycle_pt = nullptr; // await explicit set - started_cycling = false; - ex_current_was_last = false; - ex_current_was_cycle_pt = false; -} - -/*********************************************************************** - * ELIST_ITERATOR::ELIST_ITERATOR - * - * CONSTRUCTOR - set iterator to specified list; - **********************************************************************/ - -inline ELIST_ITERATOR::ELIST_ITERATOR(ELIST *list_to_iterate) { - set_to_list(list_to_iterate); -} - -/*********************************************************************** - * ELIST_ITERATOR::add_after_then_move - * - * Add a new element to the list after the current element and move the - * iterator to the new element. - **********************************************************************/ - -inline void ELIST_ITERATOR::add_after_then_move( // element to add - ELIST_LINK *new_element) { + list = list_to_iterate; + prev = list->last; + current = list->First(); + next = current ? 
current->next : nullptr; + cycle_pt = nullptr; // await explicit set + started_cycling = false; + ex_current_was_last = false; + ex_current_was_cycle_pt = false; + } + /*********************************************************************** + * ELIST_ITERATOR::add_after_then_move + * + * Add a new element to the list after the current element and move the + * iterator to the new element. + **********************************************************************/ + void add_after_then_move( // add after current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::add_after_then_move"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST_ITERATOR::add_after_then_move", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST_ITERATOR::add_after_then_move"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - } else { - new_element->next = next; - - if (current) { // not extracted - current->next = new_element; - prev = current; - if (current == list->last) { - list->last = new_element; - } - } else { // current extracted - prev->next = new_element; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; list->last = new_element; + prev = next = new_element; + } else { + new_element->next = next; + + if (current) { // not extracted + current->next = new_element; + prev = current; + if (current == list->last) { + list->last = new_element; + } + } else { // current extracted + prev->next = new_element; + if (ex_current_was_last) { + list->last = new_element; + } + if (ex_current_was_cycle_pt) { + cycle_pt = new_element; + } + } } - if (ex_current_was_cycle_pt) { - cycle_pt = new_element; - } - } - } - current = new_element; -} - -/*********************************************************************** - * 
ELIST_ITERATOR::add_after_stay_put - * - * Add a new element to the list after the current element but do not move - * the iterator to the new element. - **********************************************************************/ - -inline void ELIST_ITERATOR::add_after_stay_put( // element to add - ELIST_LINK *new_element) { + current = new_element; + } // move to new + /*********************************************************************** + * ELIST_ITERATOR::add_after_stay_put + * + * Add a new element to the list after the current element but do not move + * the iterator to the new element. + **********************************************************************/ + void add_after_stay_put( // add after current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::add_after_stay_put"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST_ITERATOR::add_after_stay_put", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST_ITERATOR::add_after_stay_put"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - ex_current_was_last = false; - current = nullptr; - } else { - new_element->next = next; - - if (current) { // not extracted - current->next = new_element; - if (prev == current) { - prev = new_element; - } - if (current == list->last) { - list->last = new_element; - } - } else { // current extracted - prev->next = new_element; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; list->last = new_element; + prev = next = new_element; ex_current_was_last = false; + current = nullptr; + } else { + new_element->next = next; + + if (current) { // not extracted + current->next = new_element; + if (prev == current) { + prev = new_element; + } + if (current == list->last) { + list->last = new_element; + } + } else { // current 
extracted + prev->next = new_element; + if (ex_current_was_last) { + list->last = new_element; + ex_current_was_last = false; + } + } + next = new_element; } - } - next = new_element; - } -} - -/*********************************************************************** - * ELIST_ITERATOR::add_before_then_move - * - * Add a new element to the list before the current element and move the - * iterator to the new element. - **********************************************************************/ - -inline void ELIST_ITERATOR::add_before_then_move( // element to add - ELIST_LINK *new_element) { + } // stay at current + /*********************************************************************** + * ELIST_ITERATOR::add_before_then_move + * + * Add a new element to the list before the current element and move the + * iterator to the new element. + **********************************************************************/ + void add_before_then_move( // add before current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::add_before_then_move"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST_ITERATOR::add_before_then_move", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST_ITERATOR::add_before_then_move"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - } else { - prev->next = new_element; - if (current) { // not extracted - new_element->next = current; - next = current; - } else { // current extracted - new_element->next = next; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; list->last = new_element; + prev = next = new_element; + } else { + prev->next = new_element; + if (current) { // not extracted + new_element->next = current; + next = current; + } else { // current extracted + new_element->next = next; + if 
(ex_current_was_last) { + list->last = new_element; + } + if (ex_current_was_cycle_pt) { + cycle_pt = new_element; + } + } } - if (ex_current_was_cycle_pt) { - cycle_pt = new_element; - } - } - } - current = new_element; -} - -/*********************************************************************** - * ELIST_ITERATOR::add_before_stay_put - * - * Add a new element to the list before the current element but don't move the - * iterator to the new element. - **********************************************************************/ - -inline void ELIST_ITERATOR::add_before_stay_put( // element to add - ELIST_LINK *new_element) { + current = new_element; + } // move to new + /*********************************************************************** + * ELIST_ITERATOR::add_before_stay_put + * + * Add a new element to the list before the current element but don't move the + * iterator to the new element. + **********************************************************************/ + void add_before_stay_put( // add before current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::add_before_stay_put"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST_ITERATOR::add_before_stay_put", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST_ITERATOR::add_before_stay_put"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - list->last = new_element; - prev = next = new_element; - ex_current_was_last = true; - current = nullptr; - } else { - prev->next = new_element; - if (current) { // not extracted - new_element->next = current; - if (next == current) { - next = new_element; - } - } else { // current extracted - new_element->next = next; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; list->last = new_element; + prev = next = new_element; + ex_current_was_last = true; + current = nullptr; 
+ } else { + prev->next = new_element; + if (current) { // not extracted + new_element->next = current; + if (next == current) { + next = new_element; + } + } else { // current extracted + new_element->next = next; + if (ex_current_was_last) { + list->last = new_element; + } + } + prev = new_element; } - } - prev = new_element; - } -} - -/*********************************************************************** - * ELIST_ITERATOR::add_list_after - * - * Insert another list to this list after the current element but don't move - *the - * iterator. - **********************************************************************/ - -inline void ELIST_ITERATOR::add_list_after(ELIST *list_to_add) { + } // stay at current + /*********************************************************************** + * ELIST_ITERATOR::add_list_after + * + * Insert another list to this list after the current element but don't move + *the + * iterator. + **********************************************************************/ + void add_list_after( // add a list & + IntrusiveForwardList *list_to_add) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::add_list_after"); - } - if (!list_to_add) { + } + if (!list_to_add) { BAD_PARAMETER.abort("ELIST_ITERATOR::add_list_after", "list_to_add is nullptr"); - } + } #endif - if (!list_to_add->empty()) { - if (list->empty()) { - list->last = list_to_add->last; - prev = list->last; - next = list->First(); - ex_current_was_last = true; - current = nullptr; - } else { - if (current) { // not extracted - current->next = list_to_add->First(); - if (current == list->last) { - list->last = list_to_add->last; - } - list_to_add->last->next = next; - next = current->next; - } else { // current extracted - prev->next = list_to_add->First(); - if (ex_current_was_last) { + if (!list_to_add->empty()) { + if (list->empty()) { list->last = list_to_add->last; - ex_current_was_last = false; + prev = list->last; + next = list->First(); + 
ex_current_was_last = true; + current = nullptr; + } else { + if (current) { // not extracted + current->next = list_to_add->First(); + if (current == list->last) { + list->last = list_to_add->last; + } + list_to_add->last->next = next; + next = current->next; + } else { // current extracted + prev->next = list_to_add->First(); + if (ex_current_was_last) { + list->last = list_to_add->last; + ex_current_was_last = false; + } + list_to_add->last->next = next; + next = prev->next; + } } - list_to_add->last->next = next; - next = prev->next; + list_to_add->last = nullptr; } - } - list_to_add->last = nullptr; - } -} - -/*********************************************************************** - * ELIST_ITERATOR::add_list_before - * - * Insert another list to this list before the current element. Move the - * iterator to the start of the inserted elements - * iterator. - **********************************************************************/ - -inline void ELIST_ITERATOR::add_list_before(ELIST *list_to_add) { + } // stay at current + /*********************************************************************** + * ELIST_ITERATOR::add_list_before + * + * Insert another list to this list before the current element. Move the + * iterator to the start of the inserted elements + * iterator. 
+ **********************************************************************/ + void add_list_before( // add a list & + IntrusiveForwardList *list_to_add) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::add_list_before"); - } - if (!list_to_add) { + } + if (!list_to_add) { BAD_PARAMETER.abort("ELIST_ITERATOR::add_list_before", "list_to_add is nullptr"); - } + } #endif - if (!list_to_add->empty()) { - if (list->empty()) { - list->last = list_to_add->last; - prev = list->last; - current = list->First(); + if (!list_to_add->empty()) { + if (list->empty()) { + list->last = list_to_add->last; + prev = list->last; + current = list->First(); #ifndef NDEBUG // V522 There might be dereferencing of a potential null pointer 'current'. elst.h 579 if (!current) { BAD_PARAMETER.abort("ELIST_ITERATOR::add_list_before", "current is nullptr"); } #endif - next = current->next; - ex_current_was_last = false; - } else { - prev->next = list_to_add->First(); - if (current) { // not extracted - list_to_add->last->next = current; - } else { // current extracted - list_to_add->last->next = next; - if (ex_current_was_last) { - list->last = list_to_add->last; + next = current->next; + ex_current_was_last = false; + } else { + prev->next = list_to_add->First(); + if (current) { // not extracted + list_to_add->last->next = current; + } else { // current extracted + list_to_add->last->next = next; + if (ex_current_was_last) { + list->last = list_to_add->last; + } + if (ex_current_was_cycle_pt) { + cycle_pt = prev->next; + } + } + current = prev->next; + next = current->next; } + list_to_add->last = nullptr; + } + } // move to it 1st item + + T *data() { // get current data +#ifndef NDEBUG + if (!list) { + NO_LIST.error("ELIST_ITERATOR::data", ABORT); + } + if (!current) { + NULL_DATA.error("ELIST_ITERATOR::data", ABORT); + } +#endif + return current; + } + /*********************************************************************** + * ELIST_ITERATOR::data_relative + * + * 
Return the data pointer to the element "offset" elements from current. + * "offset" must not be less than -1. + * (This function can't be INLINEd because it contains a loop) + **********************************************************************/ + T *data_relative( // get data + or - ... + int8_t offset) { // offset from current + T *ptr; + +#ifndef NDEBUG + if (!list) + NO_LIST.error("ELIST_ITERATOR::data_relative", ABORT); + if (list->empty()) + EMPTY_LIST.error("ELIST_ITERATOR::data_relative", ABORT); + if (offset < -1) + BAD_PARAMETER.error("ELIST_ITERATOR::data_relative", ABORT, "offset < -l"); +#endif + + if (offset == -1) { + ptr = prev; + } else { + for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) { + ; + } + } + +#ifndef NDEBUG + if (!ptr) + NULL_DATA.error("ELIST_ITERATOR::data_relative", ABORT); +#endif + + return ptr; + } // offset from current + /*********************************************************************** + * ELIST_ITERATOR::forward + * + * Move the iterator to the next element of the list. + * REMEMBER: ALL LISTS ARE CIRCULAR. + **********************************************************************/ + T *forward() { +#ifndef NDEBUG + if (!list) + NO_LIST.error("ELIST_ITERATOR::forward", ABORT); +#endif + if (list->empty()) { + return nullptr; + } + + if (current) { // not removed so + // set previous + prev = current; + started_cycling = true; + // In case next is deleted by another iterator, get next from current. 
+ current = current->next; + } else { if (ex_current_was_cycle_pt) { - cycle_pt = prev->next; + cycle_pt = next; } + current = next; } - current = prev->next; +#ifndef NDEBUG + if (!current) + NULL_DATA.error("ELIST_ITERATOR::forward", ABORT); +#endif next = current->next; - } - list_to_add->last = nullptr; - } -} -/*********************************************************************** - * ELIST_ITERATOR::extract - * - * Do extraction by removing current from the list, returning it to the - * caller, but NOT updating the iterator. (So that any calling loop can do - * this.) The iterator's current points to nullptr. If the extracted element - * is to be deleted, this is the callers responsibility. - **********************************************************************/ - -inline ELIST_LINK *ELIST_ITERATOR::extract() { - ELIST_LINK *extracted_link; +#ifndef NDEBUG + if (!next) { + NULL_NEXT.error("ELIST_ITERATOR::forward", ABORT, + "This is: %p Current is: %p", + static_cast(this), + static_cast(current)); + } +#endif + return current; + } // move to next element + + /*********************************************************************** + * ELIST_ITERATOR::extract + * + * Do extraction by removing current from the list, returning it to the + * caller, but NOT updating the iterator. (So that any calling loop can do + * this.) The iterator's current points to nullptr. If the extracted element + * is to be deleted, this is the callers responsibility. + **********************************************************************/ + T *extract() { + T *extracted_link; #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::extract"); - } - if (!current) { // list empty or - // element extracted + } + if (!current) { // list empty or + // element extracted NULL_CURRENT.abort("ELIST_ITERATOR::extract"); - } + } #endif - if (list->singleton()) { - // Special case where we do need to change the iterator. 
- prev = next = list->last = nullptr; - } else { - prev->next = next; // remove from list - - ex_current_was_last = (current == list->last); - if (ex_current_was_last) { - list->last = prev; - } - } - // Always set ex_current_was_cycle_pt so an add/forward will work in a loop. - ex_current_was_cycle_pt = (current == cycle_pt); - extracted_link = current; - extracted_link->next = nullptr; // for safety - current = nullptr; - return extracted_link; -} - -/*********************************************************************** - * ELIST_ITERATOR::move_to_first() - * - * Move current so that it is set to the start of the list. - * Return data just in case anyone wants it. - **********************************************************************/ + if (list->singleton()) { + // Special case where we do need to change the iterator. + prev = next = list->last = nullptr; + } else { + prev->next = next; // remove from list -inline ELIST_LINK *ELIST_ITERATOR::move_to_first() { + ex_current_was_last = (current == list->last); + if (ex_current_was_last) { + list->last = prev; + } + } + // Always set ex_current_was_cycle_pt so an add/forward will work in a loop. + ex_current_was_cycle_pt = (current == cycle_pt); + extracted_link = current; + extracted_link->next = nullptr; // for safety + current = nullptr; + return extracted_link; + } // remove from list + /*********************************************************************** + * ELIST_ITERATOR::move_to_first() + * + * Move current so that it is set to the start of the list. + * Return data just in case anyone wants it. + **********************************************************************/ + T *move_to_first() { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::move_to_first"); - } + } #endif - current = list->First(); - prev = list->last; - next = current ? current->next : nullptr; - return current; -} + current = list->First(); + prev = list->last; + next = current ? 
current->next : nullptr; + return current; + } // go to start of list + /*********************************************************************** + * ELIST_ITERATOR::move_to_last() + * + * Move current so that it is set to the end of the list. + * Return data just in case anyone wants it. + * (This function can't be INLINEd because it contains a loop) + **********************************************************************/ + T *move_to_last() { +#ifndef NDEBUG + if (!list) + NO_LIST.error("ELIST_ITERATOR::move_to_last", ABORT); +#endif -/*********************************************************************** - * ELIST_ITERATOR::mark_cycle_pt() - * - * Remember the current location so that we can tell whether we've returned - * to this point later. - * - * If the current point is deleted either now, or in the future, the cycle - * point will be set to the next item which is set to current. This could be - * by a forward, add_after_then_move or add_after_then_move. - **********************************************************************/ + while (current != list->last) { + forward(); + } -inline void ELIST_ITERATOR::mark_cycle_pt() { + return current; + } // go to end of list + /*********************************************************************** + * ELIST_ITERATOR::mark_cycle_pt() + * + * Remember the current location so that we can tell whether we've returned + * to this point later. + * + * If the current point is deleted either now, or in the future, the cycle + * point will be set to the next item which is set to current. This could be + * by a forward, add_after_then_move or add_after_then_move. 
+ **********************************************************************/ + void mark_cycle_pt() { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::mark_cycle_pt"); - } + } #endif - if (current) { - cycle_pt = current; - } else { - ex_current_was_cycle_pt = true; - } - started_cycling = false; -} + if (current) { + cycle_pt = current; + } else { + ex_current_was_cycle_pt = true; + } + started_cycling = false; + } // remember current -/*********************************************************************** - * ELIST_ITERATOR::at_first() - * - * Are we at the start of the list? - * - **********************************************************************/ + bool empty() const { // is list empty? +#ifndef NDEBUG + if (!list) { + NO_LIST.error("ELIST_ITERATOR::empty", ABORT); + } +#endif + return list->empty(); + } -inline bool ELIST_ITERATOR::at_first() const { + bool current_extracted() const { // current extracted? + return !current; + } + /*********************************************************************** + * ELIST_ITERATOR::at_first() + * + * Are we at the start of the list? + * + **********************************************************************/ + bool at_first() const { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::at_first"); - } + } #endif - // we're at a deleted - return ((list->empty()) || (current == list->First()) || - ((current == nullptr) && (prev == list->last) && // NON-last pt between - !ex_current_was_last)); // first and last -} - -/*********************************************************************** - * ELIST_ITERATOR::at_last() - * - * Are we at the end of the list? 
- * - **********************************************************************/ - -inline bool ELIST_ITERATOR::at_last() const { + // we're at a deleted + return ((list->empty()) || (current == list->First()) || + ((current == nullptr) && (prev == list->last) && // NON-last pt between + !ex_current_was_last)); // first and last + } // Current is first? + /*********************************************************************** + * ELIST_ITERATOR::at_last() + * + * Are we at the end of the list? + * + **********************************************************************/ + bool at_last() const { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::at_last"); - } + } #endif - // we're at a deleted - return ((list->empty()) || (current == list->last) || - ((current == nullptr) && (prev == list->last) && // last point between - ex_current_was_last)); // first and last -} - -/*********************************************************************** - * ELIST_ITERATOR::cycled_list() - * - * Have we returned to the cycle_pt since it was set? - * - **********************************************************************/ - -inline bool ELIST_ITERATOR::cycled_list() const { + // we're at a deleted + return ((list->empty()) || (current == list->last) || + ((current == nullptr) && (prev == list->last) && // last point between + ex_current_was_last)); // first and last + } // Current is last? + /*********************************************************************** + * ELIST_ITERATOR::cycled_list() + * + * Have we returned to the cycle_pt since it was set? + * + **********************************************************************/ + bool cycled_list() const { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST_ITERATOR::cycled_list"); - } + } #endif - return ((list->empty()) || ((current == cycle_pt) && started_cycling)); -} + return ((list->empty()) || ((current == cycle_pt) && started_cycling)); + } // Completed a cycle? 
+ /*********************************************************************** + * ELIST_ITERATOR::add_to_end + * + * Add a new element to the end of the list without moving the iterator. + * This is provided because a single linked list cannot move to the last as + * the iterator couldn't set its prev pointer. Adding to the end is + * essential for implementing + queues. + **********************************************************************/ + void add_to_end( // add at end & + T *new_element) { +#ifndef NDEBUG + if (!list) { + NO_LIST.abort("ELIST_ITERATOR::add_to_end"); + } + if (!new_element) { + BAD_PARAMETER.abort("ELIST_ITERATOR::add_to_end", "new_element is nullptr"); + } + if (new_element->next) { + STILL_LINKED.abort("ELIST_ITERATOR::add_to_end"); + } +#endif -/*********************************************************************** - * ELIST_ITERATOR::sort() - * - * Sort the elements of the list, then reposition at the start. - * - **********************************************************************/ + if (this->at_last()) { + this->add_after_stay_put(new_element); + } else { + if (this->at_first()) { + this->add_before_stay_put(new_element); + list->last = new_element; + } else { // Iteratr is elsewhere + new_element->next = list->last->next; + list->last->next = new_element; + list->last = new_element; + } + } + } // don't move + /*********************************************************************** + * ELIST_ITERATOR::exchange() + * + * Given another iterator, whose current element is a different element on + * the same list list OR an element of another list, exchange the two current + * elements. On return, each iterator points to the element which was the + * other iterators current on entry. + * (This function hasn't been in-lined because its a bit big!) 
+ **********************************************************************/ + void exchange( // positions of 2 links + Iterator *other_it) { // other iterator + constexpr ERRCODE DONT_EXCHANGE_DELETED("Can't exchange deleted elements of lists"); + + T *old_current; -inline void ELIST_ITERATOR::sort( // sort elements - int comparator( // comparison routine - const void *, const void *)) { #ifndef NDEBUG - if (!list) { - NO_LIST.abort("ELIST_ITERATOR::sort"); - } + if (!list) + NO_LIST.error("ELIST_ITERATOR::exchange", ABORT); + if (!other_it) + BAD_PARAMETER.error("ELIST_ITERATOR::exchange", ABORT, "other_it nullptr"); + if (!(other_it->list)) + NO_LIST.error("ELIST_ITERATOR::exchange", ABORT, "other_it"); #endif - list->sort(comparator); - move_to_first(); -} + /* Do nothing if either list is empty or if both iterators reference the same + link */ -/*********************************************************************** - * ELIST_ITERATOR::add_to_end - * - * Add a new element to the end of the list without moving the iterator. - * This is provided because a single linked list cannot move to the last as - * the iterator couldn't set its prev pointer. Adding to the end is - * essential for implementing - queues. -**********************************************************************/ + if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) { + return; + } + + /* Error if either current element is deleted */ + + if (!current || !other_it->current) { + DONT_EXCHANGE_DELETED.error("ELIST_ITERATOR.exchange", ABORT); + } + + /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements + (other before this); non-doubleton adjacent elements (this before other); + non-adjacent elements. 
*/ + + // adjacent links + if ((next == other_it->current) || (other_it->next == current)) { + // doubleton list + if ((next == other_it->current) && (other_it->next == current)) { + prev = next = current; + other_it->prev = other_it->next = other_it->current; + } else { // non-doubleton with + // adjacent links + // other before this + if (other_it->next == current) { + other_it->prev->next = current; + other_it->current->next = next; + current->next = other_it->current; + other_it->next = other_it->current; + prev = current; + } else { // this before other + prev->next = other_it->current; + current->next = other_it->next; + other_it->current->next = current; + next = current; + other_it->prev = other_it->current; + } + } + } else { // no overlap + prev->next = other_it->current; + current->next = other_it->next; + other_it->prev->next = current; + other_it->current->next = next; + } + + /* update end of list pointer when necessary (remember that the 2 iterators + may iterate over different lists!) */ + + if (list->last == current) { + list->last = other_it->current; + } + if (other_it->list->last == other_it->current) { + other_it->list->last = current; + } + + if (current == cycle_pt) { + cycle_pt = other_it->cycle_pt; + } + if (other_it->current == other_it->cycle_pt) { + other_it->cycle_pt = cycle_pt; + } + + /* The actual exchange - in all cases*/ -inline void ELIST_ITERATOR::add_to_end( // element to add - ELIST_LINK *new_element) { + old_current = current; + current = other_it->current; + other_it->current = old_current; + } // other iterator + + //# elements in list + int32_t length() const { + return list->length(); + } + /*********************************************************************** + * ELIST_ITERATOR::sort() + * + * Sort the elements of the list, then reposition at the start. 
+ * + **********************************************************************/ + void sort( // sort elements + int comparator( // comparison routine + const T *, const T *)) { #ifndef NDEBUG - if (!list) { - NO_LIST.abort("ELIST_ITERATOR::add_to_end"); + if (!list) { + NO_LIST.error("ELIST_ITERATOR::sort", ABORT); + } +#endif + + list->sort(comparator); + move_to_first(); + } + }; + using ITERATOR = Iterator; // compat + +private: + T *last = nullptr; // End of list + //(Points to head) + T *First() { // return first + return last ? last->next : nullptr; } - if (!new_element) { - BAD_PARAMETER.abort("ELIST_ITERATOR::add_to_end", "new_element is nullptr"); + +public: + ~IntrusiveForwardList() { + clear(); } - if (new_element->next) { - STILL_LINKED.abort("ELIST_ITERATOR::add_to_end"); + + /* delete elements */ + void clear() { + internal_clear(); } -#endif - if (this->at_last()) { - this->add_after_stay_put(new_element); - } else { - if (this->at_first()) { - this->add_before_stay_put(new_element); - list->last = new_element; - } else { // Iterator is elsewhere - new_element->next = list->last->next; - list->last->next = new_element; - list->last = new_element; + /* Become a deep copy of src_list */ + template + void deep_copy(const U *src_list, T *(*copier)(const T *)) { + Iterator from_it(const_cast(src_list)); + Iterator to_it(this); + + for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) + to_it.add_after_then_move((*copier)(from_it.data())); + } + + /*********************************************************************** + * IntrusiveForwardList::internal_clear + * + * Used by the destructor and the "clear" member function of derived list + * classes to destroy all the elements on the list. + * The calling function passes a "zapper" function which can be called to + * delete each element of the list, regardless of its derived type. 
This + * technique permits a generic clear function to destroy elements of + * different derived types correctly, without requiring virtual functions and + * the consequential memory overhead. + **********************************************************************/ + + // destroy all links + void internal_clear() { + T *ptr; + T *next; + + if (!empty()) { + ptr = last->next; // set to first + last->next = nullptr; // break circle + last = nullptr; // set list empty + while (ptr) { + next = ptr->next; + delete ptr; + ptr = next; + } + } + } + + bool empty() const { + return !last; + } + + bool singleton() const { + return last ? (last == last->next) : false; + } + + void shallow_copy( // dangerous!! + IntrusiveForwardList *from_list) { // beware destructors!! + last = from_list->last; + } + + /*********************************************************************** + * IntrusiveForwardList::assign_to_sublist + * + * The list is set to a sublist of another list. "This" list must be empty + * before this function is invoked. The two iterators passed must refer to + * the same list, different from "this" one. The sublist removed is the + * inclusive list from start_it's current position to end_it's current + * position. If this range passes over the end of the source list then the + * source list has its end set to the previous element of start_it. The + * extracted sublist is unaffected by the end point of the source list, its + * end point is always the end_it position. 
+ **********************************************************************/ + void assign_to_sublist( // to this list + Iterator *start_it, // from list start + Iterator *end_it) { // from list end + constexpr ERRCODE LIST_NOT_EMPTY("Destination list must be empty before extracting a sublist"); + + if (!empty()) { + LIST_NOT_EMPTY.error("IntrusiveForwardList.assign_to_sublist", ABORT); + } + + last = start_it->extract_sublist(end_it); + } // from list end + + // # elements in list + int32_t length() const { + int32_t count = 0; + if (last != nullptr) { + count = 1; + for (auto it = last->next; it != last; it = it->next) { + count++; + } + } + return count; + } + + /*********************************************************************** + * IntrusiveForwardList::sort + * + * Sort elements on list + * NB If you don't like the const declarations in the comparator, coerce yours: + * ( int (*)(const T *, const T *) ) + **********************************************************************/ + void sort( // sort elements + int comparator( // comparison routine + const T *, const T *)) { + // Allocate an array of pointers, one per list element. + auto count = length(); + + if (count > 0) { + // ptr array to sort + std::vector base; + base.reserve(count); + + Iterator it(this); + + // Extract all elements, putting the pointers in the array. + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + base.push_back(it.extract()); + } + + // Sort the pointer array. + std::sort(base.begin(), base.end(), + // all current comparators return -1,0,1, so we handle this correctly for std::sort + [&](auto &&l, auto &&r) {return comparator(l, r) < 0; }); + + // Rebuild the list from the sorted pointers. + for (auto current : base) { + it.add_to_end(current); + } + } + } + + // Assuming list has been sorted already, insert new_link to + // keep the list sorted according to the same comparison function. + // Comparison function is the same as used by sort, i.e.
takes + // the element pointers directly. Time is O(1) to add to beginning or end. + // Time is linear to add pre-sorted items to an empty list. + // If unique is set to true and comparator() returns 0 (an entry with the + // same information as the one contained in new_link is already in the + // list) - new_link is not added to the list and the function returns the + // pointer to the identical entry that already exists in the list + // (otherwise the function returns new_link). + T *add_sorted_and_find(int comparator(const T *, const T *), bool unique, + T *new_link) { + // Check for adding at the end. + if (last == nullptr || comparator(last, new_link) < 0) { + if (last == nullptr) { + new_link->next = new_link; + } else { + new_link->next = last->next; + last->next = new_link; + } + last = new_link; + } else { + // Need to use an iterator. + Iterator it(this); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + auto *link = it.data(); + int compare = comparator(link, new_link); + if (compare > 0) { + break; + } else if (unique && compare == 0) { + return link; + } + } + if (it.cycled_list()) { + it.add_to_end(new_link); + } else { + it.add_before_then_move(new_link); + } } + return new_link; } -} -#define ELISTIZEH(CLASSNAME) \ + // Same as above, but returns true if the new entry was inserted, false + // if the identical entry already existed in the list. + bool add_sorted(int comparator(const T *, const T *), bool unique, T *new_link) { + return (add_sorted_and_find(comparator, unique, new_link) == new_link); + } +}; + +template +using ELIST = IntrusiveForwardList; + +// add TESS_API? +// move templated lists to public include dirs?
+#define ELISTIZEH(T) \ class CLASSNAME##_LIST; \ class CLASSNAME##_IT; \ \ - class CLASSNAME##_LIST : public X_LIST { \ - using X_LIST::X_LIST; \ - }; \ - class CLASSNAME##_IT : public X_ITER { \ - using X_ITER::X_ITER; \ + class T##_LIST : public IntrusiveForwardList { \ + public: \ + using IntrusiveForwardList::IntrusiveForwardList; \ + }; \ + class T##_IT : public IntrusiveForwardList::Iterator { \ + public: \ + using IntrusiveForwardList::Iterator::Iterator; \ } } // namespace tesseract diff --git a/src/ccutil/elst2.cpp b/src/ccutil/elst2.cpp deleted file mode 100644 index 99a6a374e2..0000000000 --- a/src/ccutil/elst2.cpp +++ /dev/null @@ -1,486 +0,0 @@ -/********************************************************************** - * File: elst2.cpp (Formerly elist2.c) - * Description: Doubly linked embedded list code not in the include file. - * Author: Phil Cheatle - * - * (C) Copyright 1991, Hewlett-Packard Ltd. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include // compiler config, etc. - -#include "elst2.h" - -#include - -namespace tesseract { - -/*********************************************************************** - * ELIST2::internal_clear - * - * Used by the destructor and the "clear" member function of derived list - * classes to destroy all the elements on the list. 
- * The calling function passes a "zapper" function which can be called to - * delete each element of the list, regardless of its derived type. This - * technique permits a generic clear function to destroy elements of - * different derived types correctly, without requiring virtual functions and - * the consequential memory overhead. - **********************************************************************/ - -void ELIST2::internal_clear( // destroy all links - void (*zapper)(void *)) { - // ptr to zapper functn - ELIST2_LINK *ptr; - ELIST2_LINK *next; - - if (!empty()) { - ptr = last->next; // set to first - last->next = nullptr; // break circle - last = nullptr; // set list empty - while (ptr) { - next = ptr->next; - zapper(ptr); - ptr = next; - } - } -} - -/*********************************************************************** - * ELIST2::assign_to_sublist - * - * The list is set to a sublist of another list. "This" list must be empty - * before this function is invoked. The two iterators passed must refer to - * the same list, different from "this" one. The sublist removed is the - * inclusive list from start_it's current position to end_it's current - * position. If this range passes over the end of the source list then the - * source list has its end set to the previous element of start_it. The - * extracted sublist is unaffected by the end point of the source list, its - * end point is always the end_it position. 
- **********************************************************************/ - -void ELIST2::assign_to_sublist( // to this list - ELIST2_ITERATOR *start_it, // from list start - ELIST2_ITERATOR *end_it) { // from list end - constexpr ERRCODE LIST_NOT_EMPTY("Destination list must be empty before extracting a sublist"); - - if (!empty()) { - LIST_NOT_EMPTY.abort("ELIST2.assign_to_sublist"); - } - - last = start_it->extract_sublist(end_it); -} - -/*********************************************************************** - * ELIST2::sort - * - * Sort elements on list - * NB If you don't like the const declarations in the comparator, coerce yours: - * (int (*)(const void *, const void *) - **********************************************************************/ - -void ELIST2::sort( // sort elements - int comparator( // comparison routine - const void *, const void *)) { - // Allocate an array of pointers, one per list element. - auto count = length(); - if (count > 0) { - // ptr array to sort - std::vector base; - base.reserve(count); - - ELIST2_ITERATOR it(this); - - // Extract all elements, putting the pointers in the array. - for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - base.push_back(it.extract()); - } - - // Sort the pointer array. - qsort(&base[0], count, sizeof(base[0]), comparator); - - // Rebuild the list from the sorted pointers. - for (auto current : base) { - it.add_to_end(current); - } - } -} - -// Assuming list has been sorted already, insert new_link to -// keep the list sorted according to the same comparison function. -// Comparison function is the same as used by sort, i.e. uses double -// indirection. Time is O(1) to add to beginning or end. -// Time is linear to add pre-sorted items to an empty list. -void ELIST2::add_sorted(int comparator(const void *, const void *), ELIST2_LINK *new_link) { - // Check for adding at the end. 
- if (last == nullptr || comparator(&last, &new_link) < 0) { - if (last == nullptr) { - new_link->next = new_link; - new_link->prev = new_link; - } else { - new_link->next = last->next; - new_link->prev = last; - last->next = new_link; - new_link->next->prev = new_link; - } - last = new_link; - } else { - // Need to use an iterator. - ELIST2_ITERATOR it(this); - for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - ELIST2_LINK *link = it.data(); - if (comparator(&link, &new_link) > 0) { - break; - } - } - if (it.cycled_list()) { - it.add_to_end(new_link); - } else { - it.add_before_then_move(new_link); - } - } -} - -/*********************************************************************** - * MEMBER FUNCTIONS OF CLASS: ELIST2_ITERATOR - * ========================================== - **********************************************************************/ - -/*********************************************************************** - * ELIST2_ITERATOR::forward - * - * Move the iterator to the next element of the list. - * REMEMBER: ALL LISTS ARE CIRCULAR. - **********************************************************************/ - -ELIST2_LINK *ELIST2_ITERATOR::forward() { -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST2_ITERATOR::forward"); -#endif - if (list->empty()) { - return nullptr; - } - - if (current) { // not removed so - // set previous - prev = current; - started_cycling = true; - // In case next is deleted by another iterator, get it from the current. 
- current = current->next; - } else { - if (ex_current_was_cycle_pt) { - cycle_pt = next; - } - current = next; - } - -#ifndef NDEBUG - if (!current) - NULL_DATA.abort("ELIST2_ITERATOR::forward"); -#endif - - next = current->next; - -#ifndef NDEBUG - if (!next) { - NULL_NEXT.abort("ELIST2_ITERATOR::forward", - "This is: %p Current is: %p", - static_cast(this), - static_cast(current)); - } -#endif - - return current; -} - -/*********************************************************************** - * ELIST2_ITERATOR::backward - * - * Move the iterator to the previous element of the list. - * REMEMBER: ALL LISTS ARE CIRCULAR. - **********************************************************************/ - -ELIST2_LINK *ELIST2_ITERATOR::backward() { -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST2_ITERATOR::backward"); -#endif - if (list->empty()) { - return nullptr; - } - - if (current) { // not removed so - // set previous - next = current; - started_cycling = true; - // In case prev is deleted by another iterator, get it from current. - current = current->prev; - } else { - if (ex_current_was_cycle_pt) { - cycle_pt = prev; - } - current = prev; - } - -#ifndef NDEBUG - if (!current) - NULL_DATA.abort("ELIST2_ITERATOR::backward"); - if (!prev) { - NULL_PREV.abort("ELIST2_ITERATOR::backward", - "This is: %p Current is: %p", - static_cast(this), - static_cast(current)); - } -#endif - - prev = current->prev; - return current; -} - -/*********************************************************************** - * ELIST2_ITERATOR::data_relative - * - * Return the data pointer to the element "offset" elements from current. - * (This function can't be INLINEd because it contains a loop) - **********************************************************************/ - -ELIST2_LINK *ELIST2_ITERATOR::data_relative( // get data + or - .. 
- int8_t offset) { // offset from current - ELIST2_LINK *ptr; - -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST2_ITERATOR::data_relative"); - if (list->empty()) - EMPTY_LIST.abort("ELIST2_ITERATOR::data_relative"); -#endif - - if (offset < 0) { - for (ptr = current ? current : next; offset++ < 0; ptr = ptr->prev) { -#ifndef NDEBUG - if (!ptr) - BAD_PARAMETER.error("ELIST2_ITERATOR::data_relative", ABORT, "ptr == nullptr"); -#endif - ; - } - } else { - for (ptr = current ? current : prev; offset-- > 0; ptr = ptr->next) { -#ifndef NDEBUG - if (!ptr) - BAD_PARAMETER.error("ELIST2_ITERATOR::data_relative", ABORT, "ptr == nullptr"); -#endif - ; - } - } - -#ifndef NDEBUG - if (!ptr) - NULL_DATA.abort("ELIST2_ITERATOR::data_relative"); -#endif - - return ptr; -} - -/*********************************************************************** - * ELIST2_ITERATOR::exchange() - * - * Given another iterator, whose current element is a different element on - * the same list list OR an element of another list, exchange the two current - * elements. On return, each iterator points to the element which was the - * other iterators current on entry. - * (This function hasn't been in-lined because its a bit big!) 
- **********************************************************************/ - -void ELIST2_ITERATOR::exchange( // positions of 2 links - ELIST2_ITERATOR *other_it) { // other iterator - constexpr ERRCODE DONT_EXCHANGE_DELETED("Can't exchange deleted elements of lists"); - - ELIST2_LINK *old_current; - -#ifndef NDEBUG - if (!list) - NO_LIST.abort("ELIST2_ITERATOR::exchange"); - if (!other_it) - BAD_PARAMETER.abort("ELIST2_ITERATOR::exchange", "other_it nullptr"); - if (!(other_it->list)) - NO_LIST.abort("ELIST2_ITERATOR::exchange", "other_it"); -#endif - - /* Do nothing if either list is empty or if both iterators reference the same -link */ - - if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) { - return; - } - - /* Error if either current element is deleted */ - - if (!current || !other_it->current) { - DONT_EXCHANGE_DELETED.abort("ELIST2_ITERATOR.exchange"); - } - - /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements -(other before this); non-doubleton adjacent elements (this before other); -non-adjacent elements. 
*/ - - // adjacent links - if ((next == other_it->current) || (other_it->next == current)) { - // doubleton list - if ((next == other_it->current) && (other_it->next == current)) { - prev = next = current; - other_it->prev = other_it->next = other_it->current; - } else { // non-doubleton with - // adjacent links - // other before this - if (other_it->next == current) { - other_it->prev->next = current; - other_it->current->next = next; - other_it->current->prev = current; - current->next = other_it->current; - current->prev = other_it->prev; - next->prev = other_it->current; - - other_it->next = other_it->current; - prev = current; - } else { // this before other - prev->next = other_it->current; - current->next = other_it->next; - current->prev = other_it->current; - other_it->current->next = current; - other_it->current->prev = prev; - other_it->next->prev = current; - - next = current; - other_it->prev = other_it->current; - } - } - } else { // no overlap - prev->next = other_it->current; - current->next = other_it->next; - current->prev = other_it->prev; - next->prev = other_it->current; - other_it->prev->next = current; - other_it->current->next = next; - other_it->current->prev = prev; - other_it->next->prev = current; - } - - /* update end of list pointer when necessary (remember that the 2 iterators - may iterate over different lists!) 
*/ - - if (list->last == current) { - list->last = other_it->current; - } - if (other_it->list->last == other_it->current) { - other_it->list->last = current; - } - - if (current == cycle_pt) { - cycle_pt = other_it->cycle_pt; - } - if (other_it->current == other_it->cycle_pt) { - other_it->cycle_pt = cycle_pt; - } - - /* The actual exchange - in all cases*/ - - old_current = current; - current = other_it->current; - other_it->current = old_current; -} - -/*********************************************************************** - * ELIST2_ITERATOR::extract_sublist() - * - * This is a private member, used only by ELIST2::assign_to_sublist. - * Given another iterator for the same list, extract the links from THIS to - * OTHER inclusive, link them into a new circular list, and return a - * pointer to the last element. - * (Can't inline this function because it contains a loop) - **********************************************************************/ - -ELIST2_LINK *ELIST2_ITERATOR::extract_sublist( // from this current - ELIST2_ITERATOR *other_it) { // to other current -#ifndef NDEBUG - constexpr ERRCODE BAD_EXTRACTION_PTS("Can't extract sublist from points on different lists"); - constexpr ERRCODE DONT_EXTRACT_DELETED("Can't extract a sublist marked by deleted points"); -#endif - constexpr ERRCODE BAD_SUBLIST("Can't find sublist end point in original list"); - - ELIST2_ITERATOR temp_it = *this; - ELIST2_LINK *end_of_new_list; - -#ifndef NDEBUG - if (!other_it) - BAD_PARAMETER.abort("ELIST2_ITERATOR::extract_sublist", "other_it nullptr"); - if (!list) - NO_LIST.abort("ELIST2_ITERATOR::extract_sublist"); - if (list != other_it->list) - BAD_EXTRACTION_PTS.abort("ELIST2_ITERATOR.extract_sublist"); - if (list->empty()) - EMPTY_LIST.abort("ELIST2_ITERATOR::extract_sublist"); - - if (!current || !other_it->current) - DONT_EXTRACT_DELETED.abort("ELIST2_ITERATOR.extract_sublist"); -#endif - - ex_current_was_last = other_it->ex_current_was_last = false; - 
ex_current_was_cycle_pt = false; - other_it->ex_current_was_cycle_pt = false; - - temp_it.mark_cycle_pt(); - do { // walk sublist - if (temp_it.cycled_list()) { // can't find end pt - BAD_SUBLIST.abort("ELIST2_ITERATOR.extract_sublist"); - } - - if (temp_it.at_last()) { - list->last = prev; - ex_current_was_last = other_it->ex_current_was_last = true; - } - - if (temp_it.current == cycle_pt) { - ex_current_was_cycle_pt = true; - } - - if (temp_it.current == other_it->cycle_pt) { - other_it->ex_current_was_cycle_pt = true; - } - - temp_it.forward(); - } - // do INCLUSIVE list - while (temp_it.prev != other_it->current); - - // circularise sublist - other_it->current->next = current; - // circularise sublist - current->prev = other_it->current; - end_of_new_list = other_it->current; - - // sublist = whole list - if (prev == other_it->current) { - list->last = nullptr; - prev = current = next = nullptr; - other_it->prev = other_it->current = other_it->next = nullptr; - } else { - prev->next = other_it->next; - other_it->next->prev = prev; - - current = other_it->current = nullptr; - next = other_it->next; - other_it->prev = prev; - } - return end_of_new_list; -} - -} // namespace tesseract diff --git a/src/ccutil/elst2.h b/src/ccutil/elst2.h index 1da1413e2f..4331541a9d 100644 --- a/src/ccutil/elst2.h +++ b/src/ccutil/elst2.h @@ -19,16 +19,14 @@ #ifndef ELST2_H #define ELST2_H -#include "list.h" #include "lsterr.h" #include "serialis.h" +#include #include namespace tesseract { -class ELIST2_ITERATOR; - /********************************************************************** DESIGN NOTE =========== @@ -47,810 +45,1156 @@ i) The duplication in source does not affect the run time code size - the ii) The compiler should have a bit less work to do! 
**********************************************************************/ -/********************************************************************** - * CLASS - ELIST2_LINK - * - * Generic link class for doubly linked lists with embedded links - * - * Note: No destructor - elements are assumed to be destroyed EITHER after - * they have been extracted from a list OR by the ELIST2 destructor which - * walks the list. - **********************************************************************/ - -class ELIST2_LINK { - friend class ELIST2_ITERATOR; - friend class ELIST2; - - ELIST2_LINK *prev; - ELIST2_LINK *next; - -public: - ELIST2_LINK() { // constructor - prev = next = nullptr; - } - - ELIST2_LINK(const ELIST2_LINK &) = delete; - - // The assignment operator is required for WERD. - void operator=(const ELIST2_LINK &) { - prev = next = nullptr; - } -}; - /********************************************************************** * CLASS - ELIST2 * * Generic list class for doubly linked lists with embedded links **********************************************************************/ -class TESS_API ELIST2 { - friend class ELIST2_ITERATOR; - - ELIST2_LINK *last = nullptr; // End of list - //(Points to head) - ELIST2_LINK *First() { // return first - return last ? last->next : nullptr; - } - +template +class IntrusiveList { public: - // destroy all links - void internal_clear(void (*zapper)(void *)); - - bool empty() const { // is list empty? - return !last; - } - - bool singleton() const { - return last ? (last == last->next) : false; - } - - void shallow_copy( // dangerous!! - ELIST2 *from_list) { // beware destructors!! 
- last = from_list->last; - } - - // ptr to copier functn - void internal_deep_copy(ELIST2_LINK *(*copier)(ELIST2_LINK *), - const ELIST2 *list); // list being copied - - void assign_to_sublist( // to this list - ELIST2_ITERATOR *start_it, // from list start - ELIST2_ITERATOR *end_it); // from list end - - // # elements in list - int32_t length() const { - int32_t count = 0; - if (last != nullptr) { - count = 1; - for (auto it = last->next; it != last; it = it->next) { - count++; - } + /********************************************************************** + * CLASS - Link + * + * Generic link class for doubly linked lists with embedded links + * + * Note: No destructor - elements are assumed to be destroyed EITHER after + * they have been extracted from a list OR by the ELIST2 destructor which + * walks the list. + **********************************************************************/ + + class Link { + friend class Iterator; + friend class IntrusiveList; + + T *prev; + T *next; + + public: + Link() { // constructor + prev = next = nullptr; } - return count; - } - - void sort( // sort elements - int comparator( // comparison routine - const void *, const void *)); - - // Assuming list has been sorted already, insert new_link to - // keep the list sorted according to the same comparison function. - // Comparison function is the same as used by sort, i.e. uses double - // indirection. Time is O(1) to add to beginning or end. - // Time is linear to add pre-sorted items to an empty list. 
- void add_sorted(int comparator(const void *, const void *), ELIST2_LINK *new_link); -}; - -/*********************************************************************** - * CLASS - ELIST2_ITERATOR - * - * Generic iterator class for doubly linked lists with embedded - *links - **********************************************************************/ - -class TESS_API ELIST2_ITERATOR { - friend void ELIST2::assign_to_sublist(ELIST2_ITERATOR *, ELIST2_ITERATOR *); - - ELIST2 *list; // List being iterated - ELIST2_LINK *prev; // prev element - ELIST2_LINK *current; // current element - ELIST2_LINK *next; // next element - ELIST2_LINK *cycle_pt; // point we are cycling the list to. - bool ex_current_was_last; // current extracted was end of list - bool ex_current_was_cycle_pt; // current extracted was cycle point - bool started_cycling; // Have we moved off the start? - - ELIST2_LINK *extract_sublist( // from this current... - ELIST2_ITERATOR *other_it); // to other current - -public: - ELIST2_ITERATOR( // constructor - ELIST2 *list_to_iterate); - - void set_to_list( // change list - ELIST2 *list_to_iterate); - - void add_after_then_move( // add after current & - ELIST2_LINK *new_link); // move to new - - void add_after_stay_put( // add after current & - ELIST2_LINK *new_link); // stay at current - void add_before_then_move( // add before current & - ELIST2_LINK *new_link); // move to new + Link(const Link &) = delete; - void add_before_stay_put( // add before current & - ELIST2_LINK *new_link); // stay at current - - void add_list_after( // add a list & - ELIST2 *list_to_add); // stay at current - - void add_list_before( // add a list & - ELIST2 *list_to_add); // move to it 1st item - - ELIST2_LINK *data() { // get current data -#ifndef NDEBUG - if (!current) { - NULL_DATA.abort("ELIST2_ITERATOR::data"); - } - if (!list) { - NO_LIST.abort("ELIST2_ITERATOR::data"); + // The assignment operator is required for WERD. 
+ void operator=(const Link &) { + prev = next = nullptr; } + }; + using LINK = Link; // compat + + /*********************************************************************** + * CLASS - ELIST2_ITERATOR + * + * Generic iterator class for doubly linked lists with embedded + *links + **********************************************************************/ + + class Iterator { + friend void IntrusiveList::assign_to_sublist(Iterator *, Iterator *); + + IntrusiveList *list; // List being iterated + T *prev; // prev element + T *current; // current element + T *next; // next element + T *cycle_pt; // point we are cycling the list to. + bool ex_current_was_last; // current extracted was end of list + bool ex_current_was_cycle_pt; // current extracted was cycle point + bool started_cycling; // Have we moved off the start? + /*********************************************************************** + * ELIST2_ITERATOR::extract_sublist() + * + * This is a private member, used only by IntrusiveList::assign_to_sublist. + * Given another iterator for the same list, extract the links from THIS to + * OTHER inclusive, link them into a new circular list, and return a + * pointer to the last element. + * (Can't inline this function because it contains a loop) + **********************************************************************/ + T *extract_sublist( // from this current... + Iterator *other_it) { // to other current +#ifndef NDEBUG + constexpr ERRCODE BAD_EXTRACTION_PTS("Can't extract sublist from points on different lists"); + constexpr ERRCODE DONT_EXTRACT_DELETED("Can't extract a sublist marked by deleted points"); #endif - return current; - } - - ELIST2_LINK *data_relative( // get data + or - ... 
- int8_t offset); // offset from current - - ELIST2_LINK *forward(); // move to next element - - ELIST2_LINK *backward(); // move to prev element - - ELIST2_LINK *extract(); // remove from list - - // go to start of list - ELIST2_LINK *move_to_first(); - - ELIST2_LINK *move_to_last(); // go to end of list + constexpr ERRCODE BAD_SUBLIST("Can't find sublist end point in original list"); - void mark_cycle_pt(); // remember current + Iterator temp_it = *this; + T *end_of_new_list; - bool empty() const { // is list empty? #ifndef NDEBUG - if (!list) { - NO_LIST.abort("ELIST2_ITERATOR::empty"); - } + if (!other_it) + BAD_PARAMETER.error("ELIST2_ITERATOR::extract_sublist", ABORT, "other_it nullptr"); + if (!list) + NO_LIST.error("ELIST2_ITERATOR::extract_sublist", ABORT); + if (list != other_it->list) + BAD_EXTRACTION_PTS.error("ELIST2_ITERATOR.extract_sublist", ABORT); + if (list->empty()) + EMPTY_LIST.error("ELIST2_ITERATOR::extract_sublist", ABORT); + + if (!current || !other_it->current) + DONT_EXTRACT_DELETED.error("ELIST2_ITERATOR.extract_sublist", ABORT); #endif - return list->empty(); - } - - bool current_extracted() const { // current extracted? - return !current; - } - - bool at_first() const; // Current is first? - - bool at_last() const; // Current is last? - bool cycled_list() const; // Completed a cycle? 
+ ex_current_was_last = other_it->ex_current_was_last = false; + ex_current_was_cycle_pt = false; + other_it->ex_current_was_cycle_pt = false; - void add_to_end( // add at end & - ELIST2_LINK *new_link); // don't move + temp_it.mark_cycle_pt(); + do { // walk sublist + if (temp_it.cycled_list()) { // can't find end pt + BAD_SUBLIST.error("ELIST2_ITERATOR.extract_sublist", ABORT); + } - void exchange( // positions of 2 links - ELIST2_ITERATOR *other_it); // other iterator + if (temp_it.at_last()) { + list->last = prev; + ex_current_was_last = other_it->ex_current_was_last = true; + } - //# elements in list - int32_t length() const { - return list->length(); - } + if (temp_it.current == cycle_pt) { + ex_current_was_cycle_pt = true; + } - void sort( // sort elements - int comparator( // comparison routine - const void *, const void *)); + if (temp_it.current == other_it->cycle_pt) { + other_it->ex_current_was_cycle_pt = true; + } -private: - // Don't use the following constructor. - ELIST2_ITERATOR() = delete; -}; + temp_it.forward(); + } + // do INCLUSIVE list + while (temp_it.prev != other_it->current); + + // circularise sublist + other_it->current->next = current; + // circularise sublist + current->prev = other_it->current; + end_of_new_list = other_it->current; + + // sublist = whole list + if (prev == other_it->current) { + list->last = nullptr; + prev = current = next = nullptr; + other_it->prev = other_it->current = other_it->next = nullptr; + } else { + prev->next = other_it->next; + other_it->next->prev = prev; + + current = other_it->current = nullptr; + next = other_it->next; + other_it->prev = prev; + } + return end_of_new_list; + } // to other current + + public: + /*********************************************************************** + * ELIST2_ITERATOR::ELIST2_ITERATOR + * + * CONSTRUCTOR - set iterator to specified list; + **********************************************************************/ + Iterator( // constructor + IntrusiveList 
*list_to_iterate) { + set_to_list(list_to_iterate); + } -/*********************************************************************** - * ELIST2_ITERATOR::set_to_list - * - * (Re-)initialise the iterator to point to the start of the list_to_iterate - * over. - **********************************************************************/ + /*********************************************************************** + * ELIST2_ITERATOR::set_to_list + * + * (Re-)initialise the iterator to point to the start of the list_to_iterate + * over. + **********************************************************************/ -inline void ELIST2_ITERATOR::set_to_list( // change list - ELIST2 *list_to_iterate) { + void set_to_list( // change list + IntrusiveList *list_to_iterate) { #ifndef NDEBUG - if (!list_to_iterate) { + if (!list_to_iterate) { BAD_PARAMETER.abort("ELIST2_ITERATOR::set_to_list", "list_to_iterate is nullptr"); - } + } #endif - list = list_to_iterate; - prev = list->last; - current = list->First(); - next = current ? current->next : nullptr; - cycle_pt = nullptr; // await explicit set - started_cycling = false; - ex_current_was_last = false; - ex_current_was_cycle_pt = false; -} - -/*********************************************************************** - * ELIST2_ITERATOR::ELIST2_ITERATOR - * - * CONSTRUCTOR - set iterator to specified list; - **********************************************************************/ - -inline ELIST2_ITERATOR::ELIST2_ITERATOR(ELIST2 *list_to_iterate) { - set_to_list(list_to_iterate); -} - -/*********************************************************************** - * ELIST2_ITERATOR::add_after_then_move - * - * Add a new element to the list after the current element and move the - * iterator to the new element. 
- **********************************************************************/ - -inline void ELIST2_ITERATOR::add_after_then_move( // element to add - ELIST2_LINK *new_element) { + list = list_to_iterate; + prev = list->last; + current = list->First(); + next = current ? current->next : nullptr; + cycle_pt = nullptr; // await explicit set + started_cycling = false; + ex_current_was_last = false; + ex_current_was_cycle_pt = false; + } + /*********************************************************************** + * ELIST2_ITERATOR::add_after_then_move + * + * Add a new element to the list after the current element and move the + * iterator to the new element. + **********************************************************************/ + void add_after_then_move( // add after current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::add_after_then_move"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_after_then_move", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST2_ITERATOR::add_after_then_move"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - new_element->prev = new_element; - list->last = new_element; - prev = next = new_element; - } else { - new_element->next = next; - next->prev = new_element; - - if (current) { // not extracted - new_element->prev = current; - current->next = new_element; - prev = current; - if (current == list->last) { - list->last = new_element; - } - } else { // current extracted - new_element->prev = prev; - prev->next = new_element; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; + new_element->prev = new_element; list->last = new_element; + prev = next = new_element; + } else { + new_element->next = next; + next->prev = new_element; + + if (current) { // not extracted + new_element->prev = current; + current->next = 
new_element; + prev = current; + if (current == list->last) { + list->last = new_element; + } + } else { // current extracted + new_element->prev = prev; + prev->next = new_element; + if (ex_current_was_last) { + list->last = new_element; + } + if (ex_current_was_cycle_pt) { + cycle_pt = new_element; + } + } } - if (ex_current_was_cycle_pt) { - cycle_pt = new_element; - } - } - } - current = new_element; -} - -/*********************************************************************** - * ELIST2_ITERATOR::add_after_stay_put - * - * Add a new element to the list after the current element but do not move - * the iterator to the new element. - **********************************************************************/ - -inline void ELIST2_ITERATOR::add_after_stay_put( // element to add - ELIST2_LINK *new_element) { + current = new_element; + } // move to new + /*********************************************************************** + * ELIST2_ITERATOR::add_after_stay_put + * + * Add a new element to the list after the current element but do not move + * the iterator to the new element. 
+ **********************************************************************/ + void add_after_stay_put( // add after current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::add_after_stay_put"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_after_stay_put", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST2_ITERATOR::add_after_stay_put"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - new_element->prev = new_element; - list->last = new_element; - prev = next = new_element; - ex_current_was_last = false; - current = nullptr; - } else { - new_element->next = next; - next->prev = new_element; - - if (current) { // not extracted - new_element->prev = current; - current->next = new_element; - if (prev == current) { - prev = new_element; - } - if (current == list->last) { - list->last = new_element; - } - } else { // current extracted - new_element->prev = prev; - prev->next = new_element; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; + new_element->prev = new_element; list->last = new_element; + prev = next = new_element; ex_current_was_last = false; + current = nullptr; + } else { + new_element->next = next; + next->prev = new_element; + + if (current) { // not extracted + new_element->prev = current; + current->next = new_element; + if (prev == current) { + prev = new_element; + } + if (current == list->last) { + list->last = new_element; + } + } else { // current extracted + new_element->prev = prev; + prev->next = new_element; + if (ex_current_was_last) { + list->last = new_element; + ex_current_was_last = false; + } + } + next = new_element; } - } - next = new_element; - } -} - -/*********************************************************************** - * ELIST2_ITERATOR::add_before_then_move - * - * Add a new element to the list before the 
current element and move the - * iterator to the new element. - **********************************************************************/ - -inline void ELIST2_ITERATOR::add_before_then_move( // element to add - ELIST2_LINK *new_element) { + } // stay at current + /*********************************************************************** + * ELIST2_ITERATOR::add_before_then_move + * + * Add a new element to the list before the current element and move the + * iterator to the new element. + **********************************************************************/ + void add_before_then_move( // add before current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::add_before_then_move"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_before_then_move", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST2_ITERATOR::add_before_then_move"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - new_element->prev = new_element; - list->last = new_element; - prev = next = new_element; - } else { - prev->next = new_element; - new_element->prev = prev; - - if (current) { // not extracted - new_element->next = current; - current->prev = new_element; - next = current; - } else { // current extracted - new_element->next = next; - next->prev = new_element; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; + new_element->prev = new_element; list->last = new_element; + prev = next = new_element; + } else { + prev->next = new_element; + new_element->prev = prev; + + if (current) { // not extracted + new_element->next = current; + current->prev = new_element; + next = current; + } else { // current extracted + new_element->next = next; + next->prev = new_element; + if (ex_current_was_last) { + list->last = new_element; + } + if (ex_current_was_cycle_pt) { + cycle_pt = 
new_element; + } + } } - if (ex_current_was_cycle_pt) { - cycle_pt = new_element; - } - } - } - current = new_element; -} - -/*********************************************************************** - * ELIST2_ITERATOR::add_before_stay_put - * - * Add a new element to the list before the current element but don't move the - * iterator to the new element. - **********************************************************************/ - -inline void ELIST2_ITERATOR::add_before_stay_put( // element to add - ELIST2_LINK *new_element) { + current = new_element; + } // move to new + /*********************************************************************** + * ELIST2_ITERATOR::add_before_stay_put + * + * Add a new element to the list before the current element but don't move the + * iterator to the new element. + **********************************************************************/ + void add_before_stay_put( // add before current & + T *new_element) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::add_before_stay_put"); - } - if (!new_element) { + } + if (!new_element) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_before_stay_put", "new_element is nullptr"); - } - if (new_element->next) { + } + if (new_element->next) { STILL_LINKED.abort("ELIST2_ITERATOR::add_before_stay_put"); - } + } #endif - if (list->empty()) { - new_element->next = new_element; - new_element->prev = new_element; - list->last = new_element; - prev = next = new_element; - ex_current_was_last = true; - current = nullptr; - } else { - prev->next = new_element; - new_element->prev = prev; - - if (current) { // not extracted - new_element->next = current; - current->prev = new_element; - if (next == current) { - next = new_element; - } - } else { // current extracted - new_element->next = next; - next->prev = new_element; - if (ex_current_was_last) { + if (list->empty()) { + new_element->next = new_element; + new_element->prev = new_element; list->last = new_element; + prev = next = 
new_element; + ex_current_was_last = true; + current = nullptr; + } else { + prev->next = new_element; + new_element->prev = prev; + + if (current) { // not extracted + new_element->next = current; + current->prev = new_element; + if (next == current) { + next = new_element; + } + } else { // current extracted + new_element->next = next; + next->prev = new_element; + if (ex_current_was_last) { + list->last = new_element; + } + } + prev = new_element; } - } - prev = new_element; - } -} - -/*********************************************************************** - * ELIST2_ITERATOR::add_list_after - * - * Insert another list to this list after the current element but don't move - *the - * iterator. - **********************************************************************/ - -inline void ELIST2_ITERATOR::add_list_after(ELIST2 *list_to_add) { + } // stay at current + /*********************************************************************** + * ELIST2_ITERATOR::add_list_after + * + * Insert another list to this list after the current element but don't move + *the + * iterator. 
+ **********************************************************************/ + void add_list_after( // add a list & + IntrusiveList *list_to_add) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::add_list_after"); - } - if (!list_to_add) { + } + if (!list_to_add) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_list_after", "list_to_add is nullptr"); - } + } #endif - if (!list_to_add->empty()) { - if (list->empty()) { - list->last = list_to_add->last; - prev = list->last; - next = list->First(); - ex_current_was_last = true; - current = nullptr; - } else { - if (current) { // not extracted - current->next = list_to_add->First(); + if (!list_to_add->empty()) { + if (list->empty()) { + list->last = list_to_add->last; + prev = list->last; + next = list->First(); + ex_current_was_last = true; + current = nullptr; + } else { + if (current) { // not extracted + current->next = list_to_add->First(); #ifndef NDEBUG // V522 There might be dereferencing of a potential null pointer 'current->next'. elst2.h 522 if (!current->next) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_list_after", "current->next is nullptr"); } #endif - current->next->prev = current; - if (current == list->last) { - list->last = list_to_add->last; - } - list_to_add->last->next = next; - next->prev = list_to_add->last; - next = current->next; - } else { // current extracted - prev->next = list_to_add->First(); + current->next->prev = current; + if (current == list->last) { + list->last = list_to_add->last; + } + list_to_add->last->next = next; + next->prev = list_to_add->last; + next = current->next; + } else { // current extracted + prev->next = list_to_add->First(); #ifndef NDEBUG // V522 There might be dereferencing of a potential null pointer 'prev->next'. 
elst2.h 531 if (!prev->next) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_list_after", "prev->next is nullptr"); } #endif - prev->next->prev = prev; - if (ex_current_was_last) { - list->last = list_to_add->last; - ex_current_was_last = false; + prev->next->prev = prev; + if (ex_current_was_last) { + list->last = list_to_add->last; + ex_current_was_last = false; + } + list_to_add->last->next = next; + next->prev = list_to_add->last; + next = prev->next; + } } - list_to_add->last->next = next; - next->prev = list_to_add->last; - next = prev->next; + list_to_add->last = nullptr; } - } - list_to_add->last = nullptr; - } -} - -/*********************************************************************** - * ELIST2_ITERATOR::add_list_before - * - * Insert another list to this list before the current element. Move the - * iterator to the start of the inserted elements - * iterator. - **********************************************************************/ - -inline void ELIST2_ITERATOR::add_list_before(ELIST2 *list_to_add) { + } // stay at current + /*********************************************************************** + * ELIST2_ITERATOR::add_list_before + * + * Insert another list to this list before the current element. Move the + * iterator to the start of the inserted elements + * iterator. 
+ **********************************************************************/ + void add_list_before( // add a list & + IntrusiveList *list_to_add) { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::add_list_before"); - } - if (!list_to_add) { + } + if (!list_to_add) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_list_before", "list_to_add is nullptr"); - } + } #endif - if (!list_to_add->empty()) { - if (list->empty()) { - list->last = list_to_add->last; - prev = list->last; - current = list->First(); + if (!list_to_add->empty()) { + if (list->empty()) { + list->last = list_to_add->last; + prev = list->last; + current = list->First(); #ifndef NDEBUG // V522 There might be dereferencing of a potential null pointer 'current'. elst2.h 568 if (!current) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_list_before", "current is nullptr"); } #endif - next = current->next; - ex_current_was_last = false; - } else { - prev->next = list_to_add->First(); + next = current->next; + ex_current_was_last = false; + } else { + prev->next = list_to_add->First(); #ifndef NDEBUG // V522 There might be dereferencing of a potential null pointer 'prev->next'. 
elst2.h 572 if (!prev->next) { BAD_PARAMETER.abort("ELIST2_ITERATOR::add_list_before", "prev->next is nullptr"); } #endif - prev->next->prev = prev; + prev->next->prev = prev; + + if (current) { // not extracted + list_to_add->last->next = current; + current->prev = list_to_add->last; + } else { // current extracted + list_to_add->last->next = next; + next->prev = list_to_add->last; + if (ex_current_was_last) { + list->last = list_to_add->last; + } + if (ex_current_was_cycle_pt) { + cycle_pt = prev->next; + } + } + current = prev->next; + next = current->next; + } + list_to_add->last = nullptr; + } + } // move to it 1st item + + T *data() { // get current data +#ifndef NDEBUG + if (!current) { + NULL_DATA.error("ELIST2_ITERATOR::data", ABORT); + } + if (!list) { + NO_LIST.error("ELIST2_ITERATOR::data", ABORT); + } +#endif + return current; + } + /*********************************************************************** + * ELIST2_ITERATOR::data_relative + * + * Return the data pointer to the element "offset" elements from current. + * (This function can't be INLINEd because it contains a loop) + **********************************************************************/ + T *data_relative( // get data + or - ... + int8_t offset) { // offset from current + T *ptr; - if (current) { // not extracted - list_to_add->last->next = current; - current->prev = list_to_add->last; - } else { // current extracted - list_to_add->last->next = next; - next->prev = list_to_add->last; - if (ex_current_was_last) { - list->last = list_to_add->last; +#ifndef NDEBUG + if (!list) + NO_LIST.error("ELIST2_ITERATOR::data_relative", ABORT); + if (list->empty()) + EMPTY_LIST.error("ELIST2_ITERATOR::data_relative", ABORT); +#endif + + if (offset < 0) { + for (ptr = current ? current : next; offset++ < 0; ptr = ptr->prev) { + ; + } + } else { + for (ptr = current ? 
current : prev; offset-- > 0; ptr = ptr->next) { + ; } + } + +#ifndef NDEBUG + if (!ptr) + NULL_DATA.error("ELIST2_ITERATOR::data_relative", ABORT); +#endif + + return ptr; + } // offset from current + /*********************************************************************** + * ELIST2_ITERATOR::forward + * + * Move the iterator to the next element of the list. + * REMEMBER: ALL LISTS ARE CIRCULAR. + **********************************************************************/ + T *forward() { +#ifndef NDEBUG + if (!list) + NO_LIST.error("ELIST2_ITERATOR::forward", ABORT); +#endif + if (list->empty()) { + return nullptr; + } + + if (current) { // not removed so + // set previous + prev = current; + started_cycling = true; + // In case next is deleted by another iterator, get it from the current. + current = current->next; + } else { if (ex_current_was_cycle_pt) { - cycle_pt = prev->next; + cycle_pt = next; } + current = next; } - current = prev->next; + +#ifndef NDEBUG + if (!current) + NULL_DATA.error("ELIST2_ITERATOR::forward", ABORT); +#endif + next = current->next; - } - list_to_add->last = nullptr; - } -} -/*********************************************************************** - * ELIST2_ITERATOR::extract - * - * Do extraction by removing current from the list, returning it to the - * caller, but NOT updating the iterator. (So that any calling loop can do - * this.) The iterator's current points to nullptr. If the extracted element - * is to be deleted, this is the callers responsibility. - **********************************************************************/ +#ifndef NDEBUG + if (!next) { + NULL_NEXT.error("ELIST2_ITERATOR::forward", ABORT, + "This is: %p Current is: %p", + static_cast(this), + static_cast(current)); + } +#endif + + return current; + } // move to next element + /*********************************************************************** + * ELIST2_ITERATOR::backward + * + * Move the iterator to the previous element of the list. 
+ * REMEMBER: ALL LISTS ARE CIRCULAR. + **********************************************************************/ + T *backward() { +#ifndef NDEBUG + if (!list) + NO_LIST.error("ELIST2_ITERATOR::backward", ABORT); +#endif + if (list->empty()) { + return nullptr; + } + + if (current) { // not removed so + // set previous + next = current; + started_cycling = true; + // In case prev is deleted by another iterator, get it from current. + current = current->prev; + } else { + if (ex_current_was_cycle_pt) { + cycle_pt = prev; + } + current = prev; + } -inline ELIST2_LINK *ELIST2_ITERATOR::extract() { - ELIST2_LINK *extracted_link; +#ifndef NDEBUG + if (!current) + NULL_DATA.error("ELIST2_ITERATOR::backward", ABORT); + if (!prev) { + NULL_PREV.error("ELIST2_ITERATOR::backward", ABORT, + "This is: %p Current is: %p", + static_cast(this), + static_cast(current)); + } +#endif + + prev = current->prev; + return current; + } // move to prev element + /*********************************************************************** + * ELIST2_ITERATOR::extract + * + * Do extraction by removing current from the list, returning it to the + * caller, but NOT updating the iterator. (So that any calling loop can do + * this.) The iterator's current points to nullptr. If the extracted element + * is to be deleted, this is the callers responsibility. + **********************************************************************/ + T *extract() { + T *extracted_link; #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::extract"); - } - if (!current) { // list empty or - // element extracted + } + if (!current) { // list empty or + // element extracted NULL_CURRENT.abort("ELIST2_ITERATOR::extract"); - } + } #endif - if (list->singleton()) { - // Special case where we do need to change the iterator. 
- prev = next = list->last = nullptr; - } else { - prev->next = next; // remove from list - next->prev = prev; - - if (current == list->last) { - list->last = prev; - ex_current_was_last = true; - } else { - ex_current_was_last = false; - } - } - // Always set ex_current_was_cycle_pt so an add/forward will work in a loop. - ex_current_was_cycle_pt = (current == cycle_pt); - extracted_link = current; - extracted_link->next = nullptr; // for safety - extracted_link->prev = nullptr; // for safety - current = nullptr; - return extracted_link; -} - -/*********************************************************************** - * ELIST2_ITERATOR::move_to_first() - * - * Move current so that it is set to the start of the list. - * Return data just in case anyone wants it. - **********************************************************************/ + if (list->singleton()) { + // Special case where we do need to change the iterator. + prev = next = list->last = nullptr; + } else { + prev->next = next; // remove from list + next->prev = prev; -inline ELIST2_LINK *ELIST2_ITERATOR::move_to_first() { + if (current == list->last) { + list->last = prev; + ex_current_was_last = true; + } else { + ex_current_was_last = false; + } + } + // Always set ex_current_was_cycle_pt so an add/forward will work in a loop. + ex_current_was_cycle_pt = (current == cycle_pt); + extracted_link = current; + extracted_link->next = nullptr; // for safety + extracted_link->prev = nullptr; // for safety + current = nullptr; + return extracted_link; + } // remove from list + /*********************************************************************** + * ELIST2_ITERATOR::move_to_first() + * + * Move current so that it is set to the start of the list. + * Return data just in case anyone wants it. 
+ **********************************************************************/ + // go to start of list + T *move_to_first() { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::move_to_first"); - } + } #endif - current = list->First(); - prev = list->last; - next = current ? current->next : nullptr; - return current; -} - -/*********************************************************************** - * ELIST2_ITERATOR::move_to_last() - * - * Move current so that it is set to the end of the list. - * Return data just in case anyone wants it. - **********************************************************************/ - -inline ELIST2_LINK *ELIST2_ITERATOR::move_to_last() { + current = list->First(); + prev = list->last; + next = current ? current->next : nullptr; + return current; + } + /*********************************************************************** + * ELIST2_ITERATOR::move_to_last() + * + * Move current so that it is set to the end of the list. + * Return data just in case anyone wants it. + **********************************************************************/ + T *move_to_last() { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::move_to_last"); - } + } #endif - current = list->last; - prev = current ? current->prev : nullptr; - next = current ? current->next : nullptr; - return current; -} - -/*********************************************************************** - * ELIST2_ITERATOR::mark_cycle_pt() - * - * Remember the current location so that we can tell whether we've returned - * to this point later. - * - * If the current point is deleted either now, or in the future, the cycle - * point will be set to the next item which is set to current. This could be - * by a forward, add_after_then_move or add_after_then_move. - **********************************************************************/ - -inline void ELIST2_ITERATOR::mark_cycle_pt() { + current = list->last; + prev = current ? 
current->prev : nullptr; + next = current ? current->next : nullptr; + return current; + } // go to end of list + /*********************************************************************** + * ELIST2_ITERATOR::mark_cycle_pt() + * + * Remember the current location so that we can tell whether we've returned + * to this point later. + * + * If the current point is deleted either now, or in the future, the cycle + * point will be set to the next item which is set to current. This could be + * by a forward, add_after_then_move or add_after_then_move. + **********************************************************************/ + void mark_cycle_pt() { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::mark_cycle_pt"); - } + } #endif - if (current) { - cycle_pt = current; - } else { - ex_current_was_cycle_pt = true; - } - started_cycling = false; -} + if (current) { + cycle_pt = current; + } else { + ex_current_was_cycle_pt = true; + } + started_cycling = false; + } // remember current -/*********************************************************************** - * ELIST2_ITERATOR::at_first() - * - * Are we at the start of the list? - * - **********************************************************************/ + bool empty() const { // is list empty? +#ifndef NDEBUG + if (!list) { + NO_LIST.error("ELIST2_ITERATOR::empty", ABORT); + } +#endif + return list->empty(); + } -inline bool ELIST2_ITERATOR::at_first() const { + bool current_extracted() const { // current extracted? + return !current; + } + /*********************************************************************** + * ELIST2_ITERATOR::at_first() + * + * Are we at the start of the list? 
+ * + **********************************************************************/ + bool at_first() const { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::at_first"); - } + } #endif - // we're at a deleted - return ((list->empty()) || (current == list->First()) || - ((current == nullptr) && (prev == list->last) && // NON-last pt between - !ex_current_was_last)); // first and last -} - -/*********************************************************************** - * ELIST2_ITERATOR::at_last() - * - * Are we at the end of the list? - * - **********************************************************************/ - -inline bool ELIST2_ITERATOR::at_last() const { + // we're at a deleted + return ((list->empty()) || (current == list->First()) || + ((current == nullptr) && (prev == list->last) && // NON-last pt between + !ex_current_was_last)); // first and last + } // Current is first? + /*********************************************************************** + * ELIST2_ITERATOR::at_last() + * + * Are we at the end of the list? + * + **********************************************************************/ + bool at_last() const { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::at_last"); - } + } #endif - // we're at a deleted - return ((list->empty()) || (current == list->last) || - ((current == nullptr) && (prev == list->last) && // last point between - ex_current_was_last)); // first and last -} - -/*********************************************************************** - * ELIST2_ITERATOR::cycled_list() - * - * Have we returned to the cycle_pt since it was set? - * - **********************************************************************/ - -inline bool ELIST2_ITERATOR::cycled_list() const { + // we're at a deleted + return ((list->empty()) || (current == list->last) || + ((current == nullptr) && (prev == list->last) && // last point between + ex_current_was_last)); // first and last + } // Current is last? 
+ /*********************************************************************** + * ELIST2_ITERATOR::cycled_list() + * + * Have we returned to the cycle_pt since it was set? + * + **********************************************************************/ + bool cycled_list() const { #ifndef NDEBUG - if (!list) { + if (!list) { NO_LIST.abort("ELIST2_ITERATOR::cycled_list"); - } + } #endif - return ((list->empty()) || ((current == cycle_pt) && started_cycling)); -} + return ((list->empty()) || ((current == cycle_pt) && started_cycling)); + } // Completed a cycle? + /*********************************************************************** + * ELIST2_ITERATOR::add_to_end + * + * Add a new element to the end of the list without moving the iterator. + * This is provided because a single linked list cannot move to the last as + * the iterator couldn't set its prev pointer. Adding to the end is + * essential for implementing queues. + **********************************************************************/ + void add_to_end( // add at end & + T *new_element) { +#ifndef NDEBUG + if (!list) { + NO_LIST.abort("ELIST2_ITERATOR::add_to_end"); + } + if (!new_element) { + BAD_PARAMETER.abort("ELIST2_ITERATOR::add_to_end", "new_element is nullptr"); + } + if (new_element->next) { + STILL_LINKED.abort("ELIST2_ITERATOR::add_to_end"); + } +#endif -/*********************************************************************** - * ELIST2_ITERATOR::sort() - * - * Sort the elements of the list, then reposition at the start. 
- * - **********************************************************************/ + if (this->at_last()) { + this->add_after_stay_put(new_element); + } else { + if (this->at_first()) { + this->add_before_stay_put(new_element); + list->last = new_element; + } else { // Iterator is elsewhere + new_element->next = list->last->next; + new_element->prev = list->last; + list->last->next->prev = new_element; + list->last->next = new_element; + list->last = new_element; + } + } + } // don't move + /*********************************************************************** + * ELIST2_ITERATOR::exchange() + * + * Given another iterator, whose current element is a different element on + * the same list OR an element of another list, exchange the two current + * elements. On return, each iterator points to the element which was the + * other iterator's current on entry. + * (This function hasn't been in-lined because it's a bit big!) + **********************************************************************/ + void exchange( // positions of 2 links + Iterator *other_it) { // other iterator + constexpr ERRCODE DONT_EXCHANGE_DELETED("Can't exchange deleted elements of lists"); + + T *old_current; -inline void ELIST2_ITERATOR::sort( // sort elements - int comparator( // comparison routine - const void *, const void *)) { #ifndef NDEBUG - if (!list) { - NO_LIST.abort("ELIST2_ITERATOR::sort"); - } + if (!list) + NO_LIST.error("ELIST2_ITERATOR::exchange", ABORT); + if (!other_it) + BAD_PARAMETER.error("ELIST2_ITERATOR::exchange", ABORT, "other_it nullptr"); + if (!(other_it->list)) + NO_LIST.error("ELIST2_ITERATOR::exchange", ABORT, "other_it"); #endif - list->sort(comparator); - move_to_first(); -} + /* Do nothing if either list is empty or if both iterators reference the same + link */ -/*********************************************************************** - * ELIST2_ITERATOR::add_to_end - * - * Add a new element to the end of the list without moving the iterator. 
- * This is provided because a single linked list cannot move to the last as - * the iterator couldn't set its prev pointer. Adding to the end is - * essential for implementing queues. -**********************************************************************/ + if ((list->empty()) || (other_it->list->empty()) || (current == other_it->current)) { + return; + } -inline void ELIST2_ITERATOR::add_to_end( // element to add - ELIST2_LINK *new_element) { + /* Error if either current element is deleted */ + + if (!current || !other_it->current) { + DONT_EXCHANGE_DELETED.error("ELIST2_ITERATOR.exchange", ABORT); + } + + /* Now handle the 4 cases: doubleton list; non-doubleton adjacent elements + (other before this); non-doubleton adjacent elements (this before other); + non-adjacent elements. */ + + // adjacent links + if ((next == other_it->current) || (other_it->next == current)) { + // doubleton list + if ((next == other_it->current) && (other_it->next == current)) { + prev = next = current; + other_it->prev = other_it->next = other_it->current; + } else { // non-doubleton with + // adjacent links + // other before this + if (other_it->next == current) { + other_it->prev->next = current; + other_it->current->next = next; + other_it->current->prev = current; + current->next = other_it->current; + current->prev = other_it->prev; + next->prev = other_it->current; + + other_it->next = other_it->current; + prev = current; + } else { // this before other + prev->next = other_it->current; + current->next = other_it->next; + current->prev = other_it->current; + other_it->current->next = current; + other_it->current->prev = prev; + other_it->next->prev = current; + + next = current; + other_it->prev = other_it->current; + } + } + } else { // no overlap + prev->next = other_it->current; + current->next = other_it->next; + current->prev = other_it->prev; + next->prev = other_it->current; + other_it->prev->next = current; + other_it->current->next = next; + other_it->current->prev = 
prev; + other_it->next->prev = current; + } + + /* update end of list pointer when necessary (remember that the 2 iterators + may iterate over different lists!) */ + + if (list->last == current) { + list->last = other_it->current; + } + if (other_it->list->last == other_it->current) { + other_it->list->last = current; + } + + if (current == cycle_pt) { + cycle_pt = other_it->cycle_pt; + } + if (other_it->current == other_it->cycle_pt) { + other_it->cycle_pt = cycle_pt; + } + + /* The actual exchange - in all cases*/ + + old_current = current; + current = other_it->current; + other_it->current = old_current; + } // other iterator + + //# elements in list + int32_t length() const { + return list->length(); + } + /*********************************************************************** + * ELIST2_ITERATOR::sort() + * + * Sort the elements of the list, then reposition at the start. + * + **********************************************************************/ + void sort( // sort elements + int comparator( // comparison routine + const T *, const T *)) { #ifndef NDEBUG - if (!list) { - NO_LIST.abort("ELIST2_ITERATOR::add_to_end"); + if (!list) { + NO_LIST.error("ELIST2_ITERATOR::sort", ABORT); + } +#endif + + list->sort(comparator); + move_to_first(); + } + + private: + // Don't use the following constructor. + Iterator() = delete; + }; + using ITERATOR = Iterator; // compat + +private: + T *last = nullptr; // End of list + //(Points to head) + T *First() { // return first + return last ? 
last->next : nullptr; } - if (!new_element) { - BAD_PARAMETER.abort("ELIST2_ITERATOR::add_to_end", "new_element is nullptr"); + +public: + ~IntrusiveList() { + clear(); } - if (new_element->next) { - STILL_LINKED.abort("ELIST2_ITERATOR::add_to_end"); + + /* delete elements */ + void clear() { + internal_clear(); } -#endif - if (this->at_last()) { - this->add_after_stay_put(new_element); - } else { - if (this->at_first()) { - this->add_before_stay_put(new_element); - list->last = new_element; - } else { // Iteratr is elsewhere - new_element->next = list->last->next; - new_element->prev = list->last; - list->last->next->prev = new_element; - list->last->next = new_element; - list->last = new_element; + /* Become a deep copy of src_list */ + template + void deep_copy(const U *src_list, T *(*copier)(const T *)) { + Iterator from_it(const_cast(src_list)); + Iterator to_it(this); + + for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) + to_it.add_after_then_move((*copier)(from_it.data())); + } + + /*********************************************************************** + * IntrusiveList::internal_clear + * + * Used by the destructor and the "clear" member function of derived list + * classes to destroy all the elements on the list. + * The calling function passes a "zapper" function which can be called to + * delete each element of the list, regardless of its derived type. This + * technique permits a generic clear function to destroy elements of + * different derived types correctly, without requiring virtual functions and + * the consequential memory overhead. 
+ **********************************************************************/ + + // destroy all links + void internal_clear() { + // ptr to zapper functn + T *ptr; + T *next; + + if (!empty()) { + ptr = last->next; // set to first + last->next = nullptr; // break circle + last = nullptr; // set list empty + while (ptr) { + next = ptr->next; + delete ptr; + ptr = next; + } } } -} - -#define ELIST2IZEH(CLASSNAME) \ - class CLASSNAME##_LIST : public X_LIST { \ - using X_LIST::X_LIST; \ - }; \ - struct CLASSNAME##_IT : X_ITER { \ - using X_ITER::X_ITER; \ - CLASSNAME *backward() { \ - return reinterpret_cast(ELIST2_ITERATOR::backward()); \ - } \ + + bool empty() const { // is list empty? + return !last; + } + + bool singleton() const { + return last ? (last == last->next) : false; + } + + void shallow_copy( // dangerous!! + IntrusiveList *from_list) { // beware destructors!! + last = from_list->last; + } + + /*********************************************************************** + * IntrusiveList::assign_to_sublist + * + * The list is set to a sublist of another list. "This" list must be empty + * before this function is invoked. The two iterators passed must refer to + * the same list, different from "this" one. The sublist removed is the + * inclusive list from start_it's current position to end_it's current + * position. If this range passes over the end of the source list then the + * source list has its end set to the previous element of start_it. The + * extracted sublist is unaffected by the end point of the source list, its + * end point is always the end_it position. 
+ **********************************************************************/ + void assign_to_sublist( // to this list + Iterator *start_it, // from list start + Iterator *end_it); // from list end + + // # elements in list + int32_t length() const { + int32_t count = 0; + if (last != nullptr) { + count = 1; + for (auto it = last->next; it != last; it = it->next) { + count++; + } + } + return count; + } + /*********************************************************************** + * IntrusiveList::sort + * + * Sort elements on list + * NB If you don't like the const declarations in the comparator, coerce yours: + * (int (*)(const void *, const void *) + **********************************************************************/ + void sort( // sort elements + int comparator( // comparison routine + const T *, const T *)) { + // Allocate an array of pointers, one per list element. + auto count = length(); + if (count > 0) { + // ptr array to sort + std::vector base; + base.reserve(count); + + Iterator it(this); + + // Extract all elements, putting the pointers in the array. + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + base.push_back(it.extract()); + } + + // Sort the pointer array. + std::sort(base.begin(), base.end(), + // all current comparators return -1,0,1, so we handle this correctly for std::sort + [&](auto &&l, auto &&r) {return comparator(l, r) < 0; }); + + // Rebuild the list from the sorted pointers. + for (auto current : base) { + it.add_to_end(current); + } + } + } + + // Assuming list has been sorted already, insert new_link to + // keep the list sorted according to the same comparison function. + // Comparison function is the same as used by sort, i.e. uses double + // indirection. Time is O(1) to add to beginning or end. + // Time is linear to add pre-sorted items to an empty list. + void add_sorted(int comparator(const T *, const T *), T *new_link) { + // Check for adding at the end. 
+ if (last == nullptr || comparator(last, new_link) < 0) { + if (last == nullptr) { + new_link->next = new_link; + new_link->prev = new_link; + } else { + new_link->next = last->next; + new_link->prev = last; + last->next = new_link; + new_link->next->prev = new_link; + } + last = new_link; + } else { + // Need to use an iterator. + Iterator it(this); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + auto link = it.data(); + if (comparator(link, new_link) > 0) { + break; + } + } + if (it.cycled_list()) { + it.add_to_end(new_link); + } else { + it.add_before_then_move(new_link); + } + } + } +}; + +template +using ELIST2 = IntrusiveList; + +// add TESS_API? +// move templated lists to public include dirs? +#define ELIST2IZEH(T) \ + class T##_LIST : public IntrusiveList { \ + public: \ + using IntrusiveList::IntrusiveList; \ + }; \ + class T##_IT : public IntrusiveList::Iterator { \ + public: \ + using base = IntrusiveList::Iterator; \ + using base::base; \ } } // namespace tesseract diff --git a/src/ccutil/global_params.h b/src/ccutil/global_params.h index 344d9dca4c..c3388a4b96 100644 --- a/src/ccutil/global_params.h +++ b/src/ccutil/global_params.h @@ -19,6 +19,7 @@ #define TESS_GLOBAL_PARAMS_H #include +#include // for std::forward namespace tesseract { @@ -39,6 +40,12 @@ extern BOOL_VAR_H(report_all_variables); extern DOUBLE_VAR_H(allowed_image_memory_capacity); extern BOOL_VAR_H(two_pass); +// Disable some log messages by setting log_level > 0. +extern TESS_API INT_VAR_H(log_level); + +// Get file for debug output. 
+TESS_API FILE *get_debugfp(); + } // namespace tesseract #endif diff --git a/src/ccutil/helpers.h b/src/ccutil/helpers.h index f89b7edf2a..712bbae034 100644 --- a/src/ccutil/helpers.h +++ b/src/ccutil/helpers.h @@ -58,10 +58,7 @@ #include // for INT_MIN, INT_MAX #include // std::isfinite #include -#include #include // for std::find -#include -#include #include #include #include @@ -140,22 +137,21 @@ inline const std::vector split(const std::string &s, char c) { return v; } -// A simple linear congruential random number generator. +// A simple linear congruential random number generator, +// using Knuth's constants from: +// http://en.wikipedia.org/wiki/Linear_congruential_generator. class TRand { public: + TRand() = default; // Sets the seed to the given value. void set_seed(uint64_t seed) { - e.seed(seed); - } - // Sets the seed using a hash of a string. - void set_seed(const std::string &str) { - std::hash hasher; - set_seed(static_cast(hasher(str))); + seed_ = seed; } // Returns an integer in the range 0 to INT32_MAX. int32_t IntRand() { - return e(); + Iterate(); + return seed_ >> 33; } // Returns a floating point value in the range [-range, range]. double SignedRand(double range) { @@ -167,7 +163,14 @@ class TRand { } private: - std::minstd_rand e; + // Steps the generator to the next value. + void Iterate() { + seed_ *= 6364136223846793005ULL; + seed_ += 1442695040888963407ULL; + } + + // The current value of the seed. + uint64_t seed_{1}; }; // Remove newline (if any) at the end of the string. diff --git a/src/ccutil/list.h b/src/ccutil/list.h deleted file mode 100644 index 277b250995..0000000000 --- a/src/ccutil/list.h +++ /dev/null @@ -1,71 +0,0 @@ -/********************************************************************** - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#ifndef LIST_ITERATOR_H -#define LIST_ITERATOR_H - -#include - -namespace tesseract { - -template -class X_ITER : public ITERATOR { -public: - X_ITER() = default; - - template - X_ITER(U *list) : ITERATOR(list) {} - - CLASSNAME *data() { - return static_cast(ITERATOR::data()); - } - CLASSNAME *data_relative(int8_t offset) { - return static_cast(ITERATOR::data_relative(offset)); - } - CLASSNAME *forward() { - return static_cast(ITERATOR::forward()); - } - CLASSNAME *extract() { - return static_cast(ITERATOR::extract()); - } -}; - -template -class X_LIST : public CONTAINER { -public: - X_LIST() = default; - X_LIST(const X_LIST &) = delete; - X_LIST &operator=(const X_LIST &) = delete; - ~X_LIST() { - clear(); - } - - /* delete elements */ - void clear() { - CONTAINER::internal_clear([](void *link) {delete reinterpret_cast(link);}); - } - - /* Become a deep copy of src_list */ - template - void deep_copy(const U *src_list, CLASSNAME *(*copier)(const CLASSNAME *)) { - X_ITER from_it(const_cast(src_list)); - X_ITER to_it(this); - - for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) - to_it.add_after_then_move((*copier)(from_it.data())); - } -}; - -} // namespace tesseract - -#endif diff --git a/src/ccutil/tessdatamanager.h b/src/ccutil/tessdatamanager.h index 4677ea2148..648b5d22db 100644 --- a/src/ccutil/tessdatamanager.h +++ b/src/ccutil/tessdatamanager.h @@ -24,8 +24,6 @@ #include // std::vector #include "serialis.h" // FileWriter -static const char 
kTrainedDataSuffix[] = "traineddata"; - // When adding new tessdata types and file suffixes, please make sure to // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText. static const char kLangConfigFileSuffix[] = "config"; diff --git a/src/ccutil/tesserrstream.h b/src/ccutil/tesserrstream.h new file mode 100644 index 0000000000..6ad1506fe7 --- /dev/null +++ b/src/ccutil/tesserrstream.h @@ -0,0 +1,72 @@ +// File: tesserrstream.h +// Description: C++ stream which enhances tprintf +// Author: Stefan Weil +// +// (C) Copyright 2024 +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef TESSERACT_CCUTIL_TESSERRSTREAM_H +#define TESSERACT_CCUTIL_TESSERRSTREAM_H + +#if 0 + +#include +#include // for TESS_API + +#include // for std::ostream + +namespace tesseract { + +class TessStreamBuf : public std::streambuf { +public: + TessStreamBuf() = default; + +protected: + virtual int_type overflow(int_type c) override { + if (c != EOF) { + if (debugfp == nullptr) { + debugfp = get_debugfp(); + } + if (fputc(c, debugfp) == EOF) { + return EOF; + } + } + return c; + } + + virtual std::streamsize xsputn(const char* s, std::streamsize n) override { + if (debugfp == nullptr) { + debugfp = get_debugfp(); + } + return fwrite(s, 1, n, debugfp); + } + +private: + FILE *debugfp = nullptr; +}; + +class TessErrStream : public std::ostream { +private: + TessStreamBuf buf; + +public: + TessErrStream() : std::ostream(nullptr), buf() { + rdbuf(&buf); + } +}; + +extern TESS_API TessErrStream tesserr; + +} // namespace tesseract + +#endif + +#endif // TESSERACT_CCUTIL_TESSERRSTREAM_H diff --git a/src/ccutil/tprintf.cpp b/src/ccutil/tprintf.cpp index 9d1d51fa0f..bb70ccb69a 100644 --- a/src/ccutil/tprintf.cpp +++ b/src/ccutil/tprintf.cpp @@ -19,6 +19,7 @@ // Include automatically generated configuration file if running autoconf. #include // compiler config, etc. 
+#include "tesserrstream.h" #include #include @@ -113,6 +114,7 @@ static void do_transmit_logline() { TPrintGroupLinesTillEndOfScope::TPrintGroupLinesTillEndOfScope() { pending_grouping_count++; } + // pop pending grouping signal TPrintGroupLinesTillEndOfScope::~TPrintGroupLinesTillEndOfScope() { // once we get here, a spurious higher level log message may have broken up @@ -148,7 +150,7 @@ static void fz_tess_tprintf(int level, fmt::string_view format, fmt::format_args if (!msg_buffer.empty()) { if (!msg_buffer.ends_with('\n')) msg_buffer += '\n'; - // send the lower prio message before continuing with our intermittant + // send the lower prio message before continuing with our intermittent // higher prio current message: do_transmit_logline(); } @@ -342,13 +344,15 @@ void vTessPrint(int level, fmt::string_view format, fmt::format_args args) { } #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) - // Replace /dev/null by nil for Windows. + // Replace /dev/null by nul for Windows. 
if (strcmp(debug_file_name, "/dev/null") == 0) { - debug_file_name = ""; + debug_file_name = "nul"; debug_file.set_value(debug_file_name); } #endif + // TODO: handle null, stderr, stdout + if (debugfp == nullptr && debug_file_name[0] != '\0') { debugfp = fopen(debug_file_name, "a+b"); } else if (debugfp != nullptr && debug_file_name[0] == '\0') { @@ -364,4 +368,8 @@ void vTessPrint(int level, fmt::string_view format, fmt::format_args args) { #endif } +#if 0 +TessErrStream tesserr; +#endif + } // namespace tesseract diff --git a/src/ccutil/unicity_table.h b/src/ccutil/unicity_table.h index 54f740a3b3..905d34cce8 100644 --- a/src/ccutil/unicity_table.h +++ b/src/ccutil/unicity_table.h @@ -80,7 +80,7 @@ class UnicityTable { int push_back(T object) { auto idx = get_index(object); if (idx == -1) { - idx = table_.push_back(object); + idx = table_.push_back(std::move(object)); } return idx; } diff --git a/src/classify/adaptive.cpp b/src/classify/adaptive.cpp index 64e4b5f4e7..158e81f515 100644 --- a/src/classify/adaptive.cpp +++ b/src/classify/adaptive.cpp @@ -63,13 +63,12 @@ PERM_CONFIG_STRUCT::~PERM_CONFIG_STRUCT() { delete[] Ambigs; } -ADAPT_CLASS_STRUCT::ADAPT_CLASS_STRUCT() { - NumPermConfigs = 0; - MaxNumTimesSeen = 0; - TempProtos = NIL_LIST; - - PermProtos = NewBitVector(MAX_NUM_PROTOS); - PermConfigs = NewBitVector(MAX_NUM_CONFIGS); +ADAPT_CLASS_STRUCT::ADAPT_CLASS_STRUCT() : + NumPermConfigs(0), + MaxNumTimesSeen(0), + PermProtos(NewBitVector(MAX_NUM_PROTOS)), + PermConfigs(NewBitVector(MAX_NUM_CONFIGS)), + TempProtos(NIL_LIST) { zero_all_bits(PermProtos, WordsInVectorOfSize(MAX_NUM_PROTOS)); zero_all_bits(PermConfigs, WordsInVectorOfSize(MAX_NUM_CONFIGS)); @@ -128,16 +127,13 @@ int Classify::GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId) { /// /// @param MaxProtoId max id of any proto in new config /// @param FontinfoId font information from pre-trained templates -TEMP_CONFIG_STRUCT::TEMP_CONFIG_STRUCT(int maxProtoId, int fontinfoId) { - int 
NumProtos = maxProtoId + 1; - - Protos = NewBitVector(NumProtos); - - NumTimesSeen = 1; - MaxProtoId = maxProtoId; - ProtoVectorSize = WordsInVectorOfSize(NumProtos); +TEMP_CONFIG_STRUCT::TEMP_CONFIG_STRUCT(int maxProtoId, int fontinfoId) : + NumTimesSeen(1), + ProtoVectorSize(WordsInVectorOfSize(maxProtoId + 1)), + MaxProtoId(maxProtoId), + Protos(NewBitVector(maxProtoId + 1)), + FontinfoId(fontinfoId) { zero_all_bits(Protos, ProtoVectorSize); - FontinfoId = fontinfoId; } TEMP_CONFIG_STRUCT::~TEMP_CONFIG_STRUCT() { diff --git a/src/classify/adaptmatch.cpp b/src/classify/adaptmatch.cpp index bd620cb2bb..408d826d27 100644 --- a/src/classify/adaptmatch.cpp +++ b/src/classify/adaptmatch.cpp @@ -656,7 +656,7 @@ void Classify::StartBackupAdaptiveClassifier() { void Classify::SetupPass1() { EnableLearning = classify_enable_learning; UseLearning = false; - getDict().SettupStopperPass1(); + getDict().SetupStopperPass1(); } /* SetupPass1 */ @@ -672,7 +672,7 @@ void Classify::SetupPass1() { void Classify::SetupPass2() { EnableLearning = false; UseLearning = true; - getDict().SettupStopperPass2(); + getDict().SetupStopperPass2(); } /* SetupPass2 */ diff --git a/src/classify/clusttool.cpp b/src/classify/clusttool.cpp index 9befb442cf..3e2a862f74 100644 --- a/src/classify/clusttool.cpp +++ b/src/classify/clusttool.cpp @@ -277,7 +277,7 @@ void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) { fprintf(File, "insignificant "); } WriteProtoStyle(File, static_cast(Proto->Style)); - fprintf(File, "%6d\n\t", Proto->NumSamples); + fprintf(File, "%6u\n\t", Proto->NumSamples); WriteNFloats(File, N, &Proto->Mean[0]); fprintf(File, "\t"); diff --git a/src/classify/intproto.cpp b/src/classify/intproto.cpp index 23704a7a2f..4efaf59a3a 100644 --- a/src/classify/intproto.cpp +++ b/src/classify/intproto.cpp @@ -18,7 +18,9 @@ Include Files and Type Defines -----------------------------------------------------------------------------*/ +#ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES 
// for M_PI +#endif // Include automatically generated configuration file if running autoconf. #include // compiler config, etc. @@ -520,7 +522,7 @@ INT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos, for (unsigned i = 0; i < fs_size; ++i) { fs.push_back(FClass->font_set[i]); } - IClass->font_set_id = this->fontset_table_.push_back(fs); + IClass->font_set_id = this->fontset_table_.push_back(std::move(fs)); AddIntClass(IntTemplates, ClassId, IClass); for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) { diff --git a/src/classify/shapetable.h b/src/classify/shapetable.h index 74592d461f..da5bb8be23 100644 --- a/src/classify/shapetable.h +++ b/src/classify/shapetable.h @@ -27,6 +27,7 @@ #include "fontinfo.h" #include "genericheap.h" #include "intmatcher.h" +#include "tesserrstream.h" // for tesserr namespace tesseract { diff --git a/src/classify/tessclassifier.h b/src/classify/tessclassifier.h index ca27945c53..98d97cc947 100644 --- a/src/classify/tessclassifier.h +++ b/src/classify/tessclassifier.h @@ -36,17 +36,17 @@ class TESS_API TessClassifier : public ShapeClassifier { public: TessClassifier(bool pruner_only, tesseract::Classify *classify) : pruner_only_(pruner_only), classify_(classify) {} - ~TessClassifier() override = default; + virtual ~TessClassifier() override = default; // Classifies the given [training] sample, writing to results. // See ShapeClassifier for a full description. int UnicharClassifySample(const TrainingSample &sample, int debug, UNICHAR_ID keep_this, std::vector *results) override; // Provides access to the ShapeTable that this classifier works with. - const ShapeTable *GetShapeTable() const override; + virtual const ShapeTable *GetShapeTable() const override; // Provides access to the UNICHARSET that this classifier works with. // Only needs to be overridden if GetShapeTable() can return nullptr. 
- const UNICHARSET &GetUnicharset() const override; + virtual const UNICHARSET &GetUnicharset() const override; // Displays classification as the given shape_id. Creates as many windows // as it feels fit, using index as a guide for placement. Adds any created diff --git a/src/classify/trainingsample.h b/src/classify/trainingsample.h index 8edd1d3c6e..4f3f4dc260 100644 --- a/src/classify/trainingsample.h +++ b/src/classify/trainingsample.h @@ -51,7 +51,7 @@ static const int kSampleScaleSize = 3; static const int kSampleRandomSize = kSampleYShiftSize * kSampleScaleSize - 2; // ASSERT_IS_PRIME(kSampleRandomSize) !! -class TESS_API TrainingSample : public ELIST_LINK { +class TESS_API TrainingSample : public ELIST::LINK { public: TrainingSample() : class_id_(INVALID_UNICHAR_ID) diff --git a/src/dict/dawg.h b/src/dict/dawg.h index 40755f5c32..4093922107 100644 --- a/src/dict/dawg.h +++ b/src/dict/dawg.h @@ -113,7 +113,7 @@ static const char kWildcard[] = "*"; class TESS_API Dawg { public: /// Magic number to determine endianness when reading the Dawg from file. - static const int16_t kDawgMagicNumber = 42; + static constexpr int16_t kDawgMagicNumber = 42; /// A special unichar id that indicates that any appropriate pattern /// (e.g.dictionary word, 0-9 digit, etc) can be inserted instead /// Used for expressing patterns in punctuation and number Dawgs. diff --git a/src/dict/dict.cpp b/src/dict/dict.cpp index a6824fef8c..5596d84b4b 100644 --- a/src/dict/dict.cpp +++ b/src/dict/dict.cpp @@ -20,6 +20,7 @@ #include "dict.h" +#include "tesserrstream.h" // for tesserr #include #include diff --git a/src/dict/dict.h b/src/dict/dict.h index 1b5d8a4297..673efe7e6f 100644 --- a/src/dict/dict.h +++ b/src/dict/dict.h @@ -322,9 +322,9 @@ class TESS_API Dict : public DictSettings { /// Prints the current choices for this word to stdout. void DebugWordChoices(); /// Sets up stopper variables in preparation for the first pass. 
- void SettupStopperPass1(); + void SetupStopperPass1(); /// Sets up stopper variables in preparation for the second pass. - void SettupStopperPass2(); + void SetupStopperPass2(); /* context.cpp *************************************************************/ /// Check a string to see if it matches a set of lexical rules. bool case_ok(const WERD_CHOICE &word) const; diff --git a/src/dict/stopper.cpp b/src/dict/stopper.cpp index 13023e9a22..2c2a0f0e12 100644 --- a/src/dict/stopper.cpp +++ b/src/dict/stopper.cpp @@ -367,11 +367,11 @@ void Dict::EndDangerousAmbigs() {} #endif // !DISABLED_LEGACY_ENGINE -void Dict::SettupStopperPass1() { +void Dict::SetupStopperPass1() { reject_offset_ = 0.0; } -void Dict::SettupStopperPass2() { +void Dict::SetupStopperPass2() { reject_offset_ = stopper_phase2_certainty_rejection_offset; } diff --git a/src/lstm/functions.h b/src/lstm/functions.h index 1e71b2f277..2fd6f0c978 100644 --- a/src/lstm/functions.h +++ b/src/lstm/functions.h @@ -200,8 +200,9 @@ inline void SoftmaxInPlace(int n, T *inout) { inout[i] = prob; } if (prob_total > 0) { + T inv_prob_total = 1 / prob_total; for (int i = 0; i < n; i++) { - inout[i] /= prob_total; + inout[i] *= inv_prob_total; } } } diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp index 56a0de8cea..d52523ee3f 100644 --- a/src/lstm/input.cpp +++ b/src/lstm/input.cpp @@ -105,8 +105,7 @@ Image Input::PrepareLSTMInputs(const ImageData &image_data, const Network *netwo // Scale to target height, if the shape's height is > 1, or its depth if the // height == 1. If height == 0 then no scaling. // NOTE: It isn't safe for multiple threads to call this on the same pix. 
-/* static */ -void Input::PreparePixInput(Tesseract *tess, const StaticShape &shape, const Image pix, TRand *randomizer, +void Input::PreparePixInput(Tesseract &tess, const StaticShape &shape, const Image pix, TRand *randomizer, NetworkIO *input, const TBOX &line_box, float scale_factor) { bool color = shape.depth() == 3; Image var_pix = pix; @@ -143,9 +142,9 @@ void Input::PreparePixInput(Tesseract *tess, const StaticShape &shape, const Ima Image scaled_pix = pixScale(normed_pix, im_factor, im_factor); normed_pix = scaled_pix; } - if (tess != nullptr && (verbose_process || tess->tessedit_dump_pageseg_images)) + if (verbose_process || tess.tessedit_dump_pageseg_images) { - tess->AddPixDebugPage(normed_pix, fmt::format("LSTM normed input image: prepare to recognize one line of text. (height:{}, target_height:{}, scale_factor:{}, position box:{})", height, target_height, scale_factor, line_box.print_to_str())); + tess.AddPixDebugPage(normed_pix, fmt::format("LSTM normed input image: prepare to recognize one line of text. (height:{}, target_height:{}, scale_factor:{}, position box:{})", height, target_height, scale_factor, line_box.print_to_str())); } input->FromPix(shape, normed_pix, randomizer); } diff --git a/src/lstm/input.h b/src/lstm/input.h index df3644ef5c..08e1538c7f 100644 --- a/src/lstm/input.h +++ b/src/lstm/input.h @@ -92,7 +92,7 @@ class Input : public Network { // height == 1. If height == 0 then no scaling. // // NOTE: It isn't safe for multiple threads to call this on the same pix. 
- static void PreparePixInput(Tesseract *tess, const StaticShape &shape, const Image pix, + static void PreparePixInput(Tesseract &tess, const StaticShape &shape, const Image pix, TRand *randomizer, NetworkIO *input, const TBOX &line_box, float scale_factor); diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp index 7fbe918fef..398e9de007 100644 --- a/src/lstm/lstmrecognizer.cpp +++ b/src/lstm/lstmrecognizer.cpp @@ -51,7 +51,7 @@ const double kCertOffset = -0.085; // ccutil_.language_data_path_prefix = language_data_path_prefix; //} -LSTMRecognizer::LSTMRecognizer(Tesseract *tess) +LSTMRecognizer::LSTMRecognizer(Tesseract &tess) : network_(nullptr) , training_flags_(0) , training_iteration_(0) @@ -103,7 +103,7 @@ bool LSTMRecognizer::Load(const ParamsVectorSet &params, const std::string &lang return true; } // Allow it to run without a dictionary. - LoadDictionary(params, lang, mgr); + LoadDictionary(lang, mgr); return true; } @@ -156,7 +156,7 @@ bool LSTMRecognizer::DeSerialize(const TessdataManager *mgr, TFile *fp) { } bool include_charsets = mgr == nullptr || !mgr->IsComponentAvailable(TESSDATA_LSTM_RECODER) || !mgr->IsComponentAvailable(TESSDATA_LSTM_UNICHARSET); - if (include_charsets && !ccutil_.unicharset_.load_from_file(fp, false)) { + if (include_charsets && !GetUnicharset().load_from_file(fp, false)) { return false; } if (!fp->DeSerialize(network_str_)) { @@ -200,7 +200,7 @@ bool LSTMRecognizer::LoadCharsets(const TessdataManager *mgr) { if (!mgr->GetComponent(TESSDATA_LSTM_UNICHARSET, &fp)) { return false; } - if (!ccutil_.unicharset_.load_from_file(&fp, false)) { + if (!GetUnicharset().load_from_file(&fp, false)) { return false; } if (!mgr->GetComponent(TESSDATA_LSTM_RECODER, &fp)) { @@ -242,7 +242,8 @@ bool LSTMRecognizer::LoadRecoder(TFile *fp) { bool LSTMRecognizer::LoadDictionary(const ParamsVectorSet &params, const std::string &lang, TessdataManager *mgr) { delete dict_; - dict_ = new Dict(&ccutil_); + dict_ = new Dict(&tesseract_); +
ParamsVectors *params = tesseract_.params(); dict_->user_words_file.ResetToDefault(params); dict_->user_words_suffix.ResetToDefault(params); dict_->user_patterns_file.ResetToDefault(params); @@ -252,7 +253,7 @@ bool LSTMRecognizer::LoadDictionary(const ParamsVectorSet ¶ms, const std::st if (dict_->FinishLoad()) { return true; // Success. } - tprintError("Failed to load any lstm-specific dictionaries for lang {}!!\n", lang); + tprintError("Failed to load any LSTM-specific dictionaries for lang {}!!\n", lang); delete dict_; dict_ = nullptr; return false; @@ -661,13 +662,13 @@ const char *LSTMRecognizer::DecodeSingleLabel(int label) { void LSTMRecognizer::SetDataPathPrefix(const std::string &language_data_path_prefix) { - ccutil_.language_data_path_prefix_ = language_data_path_prefix; + tesseract_.language_data_path_prefix_ = language_data_path_prefix; } void LSTMRecognizer::CopyDebugParameters(CCUtil *src, Dict *dict_src) { - if (src != nullptr && &ccutil_ != src) { - ccutil_.ambigs_debug_level = src->ambigs_debug_level.value(); - ccutil_.use_ambigs_for_adaption = src->use_ambigs_for_adaption.value(); + if (src != nullptr && &tesseract_ != src) { + tesseract_.ambigs_debug_level = src->ambigs_debug_level.value(); + tesseract_.use_ambigs_for_adaption = src->use_ambigs_for_adaption.value(); } if (dict_ != nullptr && dict_ != dict_src) { diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h index 2b13ba6813..5cf28fd96e 100644 --- a/src/lstm/lstmrecognizer.h +++ b/src/lstm/lstmrecognizer.h @@ -28,6 +28,7 @@ #include "series.h" #include "unicharcompress.h" #include "genericvector.h" // for PointerVector (ptr only) +#include "tesseractclass.h" class BLOB_CHOICE_IT; struct Pix; @@ -52,7 +53,7 @@ enum TrainingFlags { class TESS_API LSTMRecognizer { public: // Takes an OPTIONAL instance reference for internal diagnostics use. 
- LSTMRecognizer(Tesseract *tess); + LSTMRecognizer(Tesseract &tess); LSTMRecognizer() = delete; //LSTMRecognizer(const std::string &language_data_path_prefix); ~LSTMRecognizer(); @@ -192,10 +193,10 @@ class TESS_API LSTMRecognizer { // Provides access to the UNICHARSET that this classifier works with. const UNICHARSET &GetUnicharset() const { - return ccutil_.unicharset_; + return tesseract_.unicharset_; } UNICHARSET &GetUnicharset() { - return ccutil_.unicharset_; + return tesseract_.unicharset_; } // Provides access to the UnicharCompress that this classifier works with. const UnicharCompress &GetRecoder() const { @@ -301,7 +302,7 @@ class TESS_API LSTMRecognizer { protected: // Sets the random seed from the sample_iteration_; void SetRandomSeed() { - int64_t seed = static_cast(sample_iteration_) * 0x10000001; + int64_t seed = sample_iteration_ * 0x10000001LL; randomizer_.set_seed(seed); randomizer_.IntRand(); } @@ -340,14 +341,13 @@ class TESS_API LSTMRecognizer { const char *DecodeSingleLabel(int label); protected: - // OPTIONAL reference to the active Tesseract instance where LSTM/Input + // Reference to the active Tesseract instance where LSTM/Input // internal diagnostics should be sent to. - Tesseract *tesseract_; + // Also provides the unicharset. Only the unicharset element is serialized. + // Has to be a CCUtil deriv class, so Dict can point to it. + Tesseract &tesseract_; // The network hierarchy. Network *network_; - // The unicharset. Only the unicharset element is serialized. - // Has to be a CCUtil, so Dict can point to it. - CCUtil ccutil_; // For backward compatibility, recoder_ is serialized iff // training_flags_ & TF_COMPRESS_UNICHARSET. // Further encode/decode ccutil_.unicharset's ids to simplify the unicharset. 
diff --git a/src/lstm/network.cpp b/src/lstm/network.cpp index bd44a7f90c..715c186f0d 100644 --- a/src/lstm/network.cpp +++ b/src/lstm/network.cpp @@ -36,9 +36,6 @@ #include "scrollview.h" #include "series.h" #include "statistc.h" -#ifdef INCLUDE_TENSORFLOW -# include "tfnetwork.h" -#endif #include #undef min @@ -294,11 +291,7 @@ Network *Network::CreateFromFile(TFile *fp) { network = new Series(name); break; case NT_TENSORFLOW: -#ifdef INCLUDE_TENSORFLOW - network = new TFNetwork(name); -#else - tprintWarn("TensorFlow not compiled in! -DINCLUDE_TENSORFLOW\n"); -#endif + tprintWarn("Unsupported TensorFlow model\n"); break; // All variants of FullyConnected. case NT_SOFTMAX: diff --git a/src/lstm/networkio.cpp b/src/lstm/networkio.cpp index bc355f2864..83359677de 100644 --- a/src/lstm/networkio.cpp +++ b/src/lstm/networkio.cpp @@ -235,6 +235,7 @@ void NetworkIO::Copy2DImage(int batch, Image pix, float black, float contrast, T int target_width = stride_map_.Size(FD_WIDTH); int num_features = NumFeatures(); bool color = num_features == 3; + float inv_contrast = 1.0f / contrast; if (width > target_width) { width = target_width; } @@ -247,11 +248,11 @@ void NetworkIO::Copy2DImage(int batch, Image pix, float black, float contrast, T int f = 0; for (int c = COLOR_RED; c <= COLOR_BLUE; ++c) { int pixel = GET_DATA_BYTE(line + x, c); - SetPixel(t, f++, pixel, black, contrast); + SetPixel(t, f++, pixel, black, inv_contrast); } } else { int pixel = GET_DATA_BYTE(line, x); - SetPixel(t, 0, pixel, black, contrast); + SetPixel(t, 0, pixel, black, inv_contrast); } } } @@ -275,6 +276,7 @@ void NetworkIO::Copy1DGreyImage(int batch, Image pix, float black, float contras index.AddOffset(batch, FD_BATCH); int t = index.t(); int target_width = stride_map_.Size(FD_WIDTH); + float inv_contrast = 1.0f / contrast; if (width > target_width) { width = target_width; } @@ -283,7 +285,7 @@ void NetworkIO::Copy1DGreyImage(int batch, Image pix, float black, float contras for (int y = 0; y < 
height; ++y) { uint32_t *line = pixGetData(pix) + wpl * y; int pixel = GET_DATA_BYTE(line, x); - SetPixel(t, y, pixel, black, contrast); + SetPixel(t, y, pixel, black, inv_contrast); } } for (; x < target_width; ++x) { @@ -298,8 +300,9 @@ void NetworkIO::Copy1DGreyImage(int batch, Image pix, float black, float contras // pixel: the value of the pixel from the image (in one channel) // black: the pixel value to map to the lowest of the range of *this // contrast: the range of pixel values to stretch to half the range of *this. -void NetworkIO::SetPixel(int t, int f, int pixel, float black, float contrast) { - float float_pixel = (pixel - black) / contrast - 1.0f; +// inv_contrast: one over the contrast, to save a divide +void NetworkIO::SetPixel(int t, int f, int pixel, float black, float inv_contrast) { + float float_pixel = (pixel - black) * inv_contrast - 1.0f; if (int_mode_) { i_[t][f] = ClipToRange(IntCastRounded((INT8_MAX + 1) * float_pixel), -INT8_MAX, INT8_MAX); } else { @@ -428,7 +431,11 @@ void NetworkIO::Randomize(int t, int offset, int num_features, TRand *randomizer if (int_mode_) { int8_t *line = i_[t] + offset; for (int i = 0; i < num_features; ++i) { +#if 01 line[i] = IntCastRounded(randomizer->SignedRand(INT8_MAX)); +#else + line[i] = 0; +#endif } } else { // float mode. diff --git a/src/lstm/networkio.h b/src/lstm/networkio.h index 317d9b36f4..3d7c785433 100644 --- a/src/lstm/networkio.h +++ b/src/lstm/networkio.h @@ -93,7 +93,8 @@ class TESS_API NetworkIO { // pixel: the value of the pixel from the image (in one channel) // black: the pixel value to map to the lowest of the range of *this // contrast: the range of pixel values to stretch to half the range of *this. - void SetPixel(int t, int f, int pixel, float black, float contrast); + // inv_contrast: one over the contrast, to save a divide + void SetPixel(int t, int f, int pixel, float black, float inv_contrast); // Converts the array to a Pix. Must be pixDestroyed after use. 
Image ToPix() const; // Prints the first and last num timesteps of the array for each feature. diff --git a/src/lstm/series.cpp b/src/lstm/series.cpp index 38c1cb154f..ce87090bbc 100644 --- a/src/lstm/series.cpp +++ b/src/lstm/series.cpp @@ -22,6 +22,7 @@ #include "fullyconnected.h" #include "networkscratch.h" #include "scrollview.h" +#include "tesserrstream.h" // for tesserr #include namespace tesseract { diff --git a/src/lstm/tfnetwork.cpp b/src/lstm/tfnetwork.cpp deleted file mode 100644 index 78aed0b94b..0000000000 --- a/src/lstm/tfnetwork.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/////////////////////////////////////////////////////////////////////// -// File: tfnetwork.cpp -// Description: Encapsulation of an entire tensorflow graph as a -// Tesseract Network. -// Author: Ray Smith -// -// (C) Copyright 2016, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -/////////////////////////////////////////////////////////////////////// - -#include // compiler config, etc. 
- -#ifdef INCLUDE_TENSORFLOW - -# include "tfnetwork.h" - -# include -# include "input.h" -# include "networkscratch.h" - -using tensorflow::Status; -using tensorflow::Tensor; -using tensorflow::TensorShape; - -namespace tesseract { - -TFNetwork::TFNetwork(const std::string &name) : Network(NT_TENSORFLOW, name, 0, 0) {} - -int TFNetwork::InitFromProtoStr(const std::string &proto_str) { - if (!model_proto_.ParseFromString(proto_str)) - return 0; - return InitFromProto(); -} - -// Writes to the given file. Returns false in case of error. -// Should be overridden by subclasses, but called by their Serialize. -bool TFNetwork::Serialize(TFile *fp) const { - if (!Network::Serialize(fp)) - return false; - std::string proto_str; - model_proto_.SerializeToString(&proto_str); - // TODO: optimize and avoid copy from proto_str to data. - std::vector data(proto_str.size()); - memcpy(&data[0], proto_str.data(), proto_str.size()); - return fp->Serialize(data); -} - -// Reads from the given file. Returns false in case of error. -// Should be overridden by subclasses, but NOT called by their DeSerialize. -bool TFNetwork::DeSerialize(TFile *fp) { - std::vector data; - if (!fp->DeSerialize(data)) - return false; - if (!model_proto_.ParseFromArray(&data[0], data.size())) { - return false; - } - return InitFromProto(); -} - -// Runs forward propagation of activations on the input line. -// See Network for a detailed discussion of the arguments. -void TFNetwork::Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose, - NetworkScratch *scratch, NetworkIO *output) { - std::vector> tf_inputs; - int depth = input_shape_.depth(); - ASSERT_HOST(depth == input.NumFeatures()); - // TODO(rays) Allow batching. For now batch_size = 1. - const StrideMap &stride_map = input.stride_map(); - // TF requires a tensor of shape float[batch, height, width, depth]. 
- TensorShape shape{1, stride_map.Size(FD_HEIGHT), stride_map.Size(FD_WIDTH), depth}; - Tensor input_tensor(tensorflow::DT_FLOAT, shape); - // The flat() member gives a 1d array, with a data() member to get the data. - auto eigen_tensor = input_tensor.flat(); - memcpy(eigen_tensor.data(), input.f(0), input.Width() * depth * sizeof(input.f(0)[0])); - // Add the tensor to the vector of inputs. - tf_inputs.emplace_back(model_proto_.image_input(), input_tensor); - - // Provide tensors giving the width and/or height of the image if they are - // required. Some tf ops require a separate tensor with knowledge of the - // size of the input as they cannot obtain it from the input tensor. This is - // usually true in the case of ops that process a batch of variable-sized - // objects. - if (!model_proto_.image_widths().empty()) { - TensorShape size_shape{1}; - Tensor width_tensor(tensorflow::DT_INT64, size_shape); - auto eigen_wtensor = width_tensor.flat(); - *eigen_wtensor.data() = stride_map.Size(FD_WIDTH); - tf_inputs.emplace_back(model_proto_.image_widths(), width_tensor); - } - if (!model_proto_.image_heights().empty()) { - TensorShape size_shape{1}; - Tensor height_tensor(tensorflow::DT_INT64, size_shape); - auto eigen_htensor = height_tensor.flat(); - *eigen_htensor.data() = stride_map.Size(FD_HEIGHT); - tf_inputs.emplace_back(model_proto_.image_heights(), height_tensor); - } - std::vector target_layers = {model_proto_.output_layer()}; - std::vector outputs; - Status s = session_->Run(tf_inputs, target_layers, {}, &outputs); - if (!s.ok()) - tprintError("session->Run failed:{}\n", s.error_message().c_str()); - ASSERT_HOST(s.ok()); - ASSERT_HOST(outputs.size() == 1); - const Tensor &output_tensor = outputs[0]; - // Check the dimensions of the output. 
- ASSERT_HOST(output_tensor.shape().dims() == 3); - int output_batch = output_tensor.shape().dim_size(0); - int output_steps = output_tensor.shape().dim_size(1); - int output_depth = output_tensor.shape().dim_size(2); - ASSERT_HOST(output_batch == 1); - ASSERT_HOST(output_depth == output_shape_.depth()); - output->Resize2d(false, output_steps, output_depth); - auto eigen_output = output_tensor.flat(); - memcpy(output->f(0), eigen_output.data(), output_steps * output_depth * sizeof(output->f(0)[0])); -} - -int TFNetwork::InitFromProto() { - spec_ = model_proto_.spec(); - input_shape_.SetShape(model_proto_.batch_size(), std::max(0, model_proto_.y_size()), - std::max(0, model_proto_.x_size()), model_proto_.depth()); - output_shape_.SetShape(model_proto_.batch_size(), 1, 0, model_proto_.num_classes()); - output_shape_.set_loss_type(model_proto_.using_ctc() ? LT_CTC : LT_SOFTMAX); - ni_ = input_shape_.height(); - no_ = output_shape_.depth(); - // Initialize the session_ with the graph. Since we can't get the graph - // back from the session_, we have to keep the proto as well - tensorflow::SessionOptions options; - session_.reset(NewSession(options)); - Status s = session_->Create(model_proto_.graph()); - if (s.ok()) - return model_proto_.global_step(); - tprintf("Session_->Create returned '{}'\n", s.error_message().c_str()); - return 0; -} - -} // namespace tesseract - -#endif // ifdef INCLUDE_TENSORFLOW diff --git a/src/lstm/tfnetwork.h b/src/lstm/tfnetwork.h deleted file mode 100644 index e6dc4ccf86..0000000000 --- a/src/lstm/tfnetwork.h +++ /dev/null @@ -1,106 +0,0 @@ -/////////////////////////////////////////////////////////////////////// -// File: tfnetwork.h -// Description: Encapsulation of an entire tensorflow graph as a -// Tesseract Network. -// Author: Ray Smith -// -// (C) Copyright 2016, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -/////////////////////////////////////////////////////////////////////// - -#ifndef TESSERACT_LSTM_TFNETWORK_H_ -#define TESSERACT_LSTM_TFNETWORK_H_ - -#ifdef INCLUDE_TENSORFLOW - -# include -# include - -# include "network.h" -# include "static_shape.h" -# include "tensorflow/core/framework/graph.pb.h" -# include "tensorflow/core/public/session.h" -# include "tfnetwork.pb.h" - -namespace tesseract { - -class TFNetwork : public Network { -public: - explicit TFNetwork(const std::string &name); - virtual ~TFNetwork() = default; - - // Returns the required shape input to the network. - StaticShape InputShape() const override { - return input_shape_; - } - // Returns the shape output from the network given an input shape (which may - // be partially unknown ie zero). - StaticShape OutputShape(const StaticShape &input_shape) const override { - return output_shape_; - } - - std::string spec() const override { - return spec_; - } - - // Deserializes *this from a serialized TFNetwork proto. Returns 0 if failed, - // otherwise the global step of the serialized graph. - int InitFromProtoStr(const std::string &proto_str); - // The number of classes in this network should be equal to those in the - // recoder_ in LSTMRecognizer. - int num_classes() const { - return output_shape_.depth(); - } - - // Writes to the given file. Returns false in case of error. - // Should be overridden by subclasses, but called by their Serialize. - bool Serialize(TFile *fp) const override; - // Reads from the given file. Returns false in case of error. 
- // Should be overridden by subclasses, but NOT called by their DeSerialize. - bool DeSerialize(TFile *fp) override; - - // Runs forward propagation of activations on the input line. - // See Network for a detailed discussion of the arguments. - void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose, - NetworkScratch *scratch, NetworkIO *output) override; - -private: - // Runs backward propagation of errors on the deltas line. - // See Network for a detailed discussion of the arguments. - bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch, - NetworkIO *back_deltas) override { - tprintError("Must override Network::Backward for type {}\n", type_); - return false; - } - - void DebugWeights() override { - tprintError("Must override Network::DebugWeights for type {}\n", type_); - } - - int InitFromProto(); - - // The original network definition for reference. - std::string spec_; - // Input tensor parameters. - StaticShape input_shape_; - // Output tensor parameters. - StaticShape output_shape_; - // The tensor flow graph is contained in here. - std::unique_ptr session_; - // The serialized graph is also contained in here. - TFNetworkModel model_proto_; -}; - -} // namespace tesseract. - -#endif // ifdef INCLUDE_TENSORFLOW - -#endif // TESSERACT_TENSORFLOW_TFNETWORK_H_ diff --git a/src/lstm/tfnetwork.pb.cc b/src/lstm/tfnetwork.pb.cc deleted file mode 100644 index 9524a4fb1e..0000000000 --- a/src/lstm/tfnetwork.pb.cc +++ /dev/null @@ -1,705 +0,0 @@ -// Generated by the protocol buffer compiler. DO NOT EDIT! 
-// source: tfnetwork.proto - -#include "tfnetwork.pb.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -// @@protoc_insertion_point(includes) -#include - -PROTOBUF_PRAGMA_INIT_SEG -namespace tesseract { -constexpr TFNetworkModel::TFNetworkModel( - ::PROTOBUF_NAMESPACE_ID::internal::ConstantInitialized) - : spec_(&::PROTOBUF_NAMESPACE_ID::internal::fixed_address_empty_string) - , image_input_(&::PROTOBUF_NAMESPACE_ID::internal::fixed_address_empty_string) - , image_widths_(&::PROTOBUF_NAMESPACE_ID::internal::fixed_address_empty_string) - , image_heights_(&::PROTOBUF_NAMESPACE_ID::internal::fixed_address_empty_string) - , output_layer_(&::PROTOBUF_NAMESPACE_ID::internal::fixed_address_empty_string) - , graph_(nullptr) - , global_step_(int64_t{0}) - , depth_(0) - , x_size_(0) - , y_size_(0) - , batch_size_(0) - , num_classes_(0) - , using_ctc_(false){} -struct TFNetworkModelDefaultTypeInternal { - constexpr TFNetworkModelDefaultTypeInternal() - : _instance(::PROTOBUF_NAMESPACE_ID::internal::ConstantInitialized{}) {} - ~TFNetworkModelDefaultTypeInternal() {} - union { - TFNetworkModel _instance; - }; -}; -PROTOBUF_ATTRIBUTE_NO_DESTROY PROTOBUF_CONSTINIT TFNetworkModelDefaultTypeInternal _TFNetworkModel_default_instance_; -} // namespace tesseract -static ::PROTOBUF_NAMESPACE_ID::Metadata file_level_metadata_tfnetwork_2eproto[1]; -static constexpr ::PROTOBUF_NAMESPACE_ID::EnumDescriptor const** file_level_enum_descriptors_tfnetwork_2eproto = nullptr; -static constexpr ::PROTOBUF_NAMESPACE_ID::ServiceDescriptor const** file_level_service_descriptors_tfnetwork_2eproto = nullptr; - -const ::PROTOBUF_NAMESPACE_ID::uint32 TableStruct_tfnetwork_2eproto::offsets[] PROTOBUF_SECTION_VARIABLE(protodesc_cold) = { - ~0u, // no _has_bits_ - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, _internal_metadata_), - ~0u, // no _extensions_ - ~0u, // no _oneof_case_ - ~0u, // no _weak_field_map_ - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, 
graph_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, global_step_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, spec_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, depth_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, x_size_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, y_size_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, batch_size_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, num_classes_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, using_ctc_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, image_input_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, image_widths_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, image_heights_), - PROTOBUF_FIELD_OFFSET(::tesseract::TFNetworkModel, output_layer_), -}; -static const ::PROTOBUF_NAMESPACE_ID::internal::MigrationSchema schemas[] PROTOBUF_SECTION_VARIABLE(protodesc_cold) = { - { 0, -1, sizeof(::tesseract::TFNetworkModel)}, -}; - -static ::PROTOBUF_NAMESPACE_ID::Message const * const file_default_instances[] = { - reinterpret_cast(&::tesseract::_TFNetworkModel_default_instance_), -}; - -const char descriptor_table_protodef_tfnetwork_2eproto[] PROTOBUF_SECTION_VARIABLE(protodesc_cold) = - "\n\017tfnetwork.proto\022\ttesseract\032%tensorflow" - "/core/framework/graph.proto\"\242\002\n\016TFNetwor" - "kModel\022*\n\005graph\030\001 \001(\0132\033.opencv_tensorflo" - "w.GraphDef\022\023\n\013global_step\030\002 \001(\003\022\014\n\004spec\030" - "\003 \001(\t\022\r\n\005depth\030\004 \001(\005\022\016\n\006x_size\030\005 \001(\005\022\016\n\006" - "y_size\030\006 \001(\005\022\022\n\nbatch_size\030\010 \001(\005\022\023\n\013num_" - "classes\030\t \001(\005\022\021\n\tusing_ctc\030\n \001(\010\022\023\n\013imag" - "e_input\030\013 \001(\t\022\024\n\014image_widths\030\014 \001(\t\022\025\n\ri" - "mage_heights\030\r \001(\t\022\024\n\014output_layer\030\016 \001(\t" - "b\006proto3" - ; -static const 
::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable*const descriptor_table_tfnetwork_2eproto_deps[1] = { - &::descriptor_table_tensorflow_2fcore_2fframework_2fgraph_2eproto, -}; -static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_tfnetwork_2eproto_once; -const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_tfnetwork_2eproto = { - false, false, 368, descriptor_table_protodef_tfnetwork_2eproto, "tfnetwork.proto", - &descriptor_table_tfnetwork_2eproto_once, descriptor_table_tfnetwork_2eproto_deps, 1, 1, - schemas, file_default_instances, TableStruct_tfnetwork_2eproto::offsets, - file_level_metadata_tfnetwork_2eproto, file_level_enum_descriptors_tfnetwork_2eproto, file_level_service_descriptors_tfnetwork_2eproto, -}; -PROTOBUF_ATTRIBUTE_WEAK const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable* descriptor_table_tfnetwork_2eproto_getter() { - return &descriptor_table_tfnetwork_2eproto; -} - -// Force running AddDescriptors() at dynamic initialization time. 
-PROTOBUF_ATTRIBUTE_INIT_PRIORITY static ::PROTOBUF_NAMESPACE_ID::internal::AddDescriptorsRunner dynamic_init_dummy_tfnetwork_2eproto(&descriptor_table_tfnetwork_2eproto); -namespace tesseract { - -// =================================================================== - -class TFNetworkModel::_Internal { - public: - static const ::opencv_tensorflow::GraphDef& graph(const TFNetworkModel* msg); -}; - -const ::opencv_tensorflow::GraphDef& -TFNetworkModel::_Internal::graph(const TFNetworkModel* msg) { - return *msg->graph_; -} -void TFNetworkModel::clear_graph() { - if (GetArenaForAllocation() == nullptr && graph_ != nullptr) { - delete graph_; - } - graph_ = nullptr; -} -TFNetworkModel::TFNetworkModel(::PROTOBUF_NAMESPACE_ID::Arena* arena, - bool is_message_owned) - : ::PROTOBUF_NAMESPACE_ID::Message(arena, is_message_owned) { - SharedCtor(); - if (!is_message_owned) { - RegisterArenaDtor(arena); - } - // @@protoc_insertion_point(arena_constructor:tesseract.TFNetworkModel) -} -TFNetworkModel::TFNetworkModel(const TFNetworkModel& from) - : ::PROTOBUF_NAMESPACE_ID::Message() { - _internal_metadata_.MergeFrom<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(from._internal_metadata_); - spec_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - if (!from._internal_spec().empty()) { - spec_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_spec(), - GetArenaForAllocation()); - } - image_input_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - if (!from._internal_image_input().empty()) { - image_input_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_image_input(), - GetArenaForAllocation()); - } - image_widths_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - if (!from._internal_image_widths().empty()) { - image_widths_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, 
from._internal_image_widths(), - GetArenaForAllocation()); - } - image_heights_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - if (!from._internal_image_heights().empty()) { - image_heights_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_image_heights(), - GetArenaForAllocation()); - } - output_layer_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - if (!from._internal_output_layer().empty()) { - output_layer_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, from._internal_output_layer(), - GetArenaForAllocation()); - } - if (from._internal_has_graph()) { - graph_ = new ::opencv_tensorflow::GraphDef(*from.graph_); - } else { - graph_ = nullptr; - } - ::memcpy(&global_step_, &from.global_step_, - static_cast(reinterpret_cast(&using_ctc_) - - reinterpret_cast(&global_step_)) + sizeof(using_ctc_)); - // @@protoc_insertion_point(copy_constructor:tesseract.TFNetworkModel) -} - -inline void TFNetworkModel::SharedCtor() { -spec_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); -image_input_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); -image_widths_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); -image_heights_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); -output_layer_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); -::memset(reinterpret_cast(this) + static_cast( - reinterpret_cast(&graph_) - reinterpret_cast(this)), - 0, static_cast(reinterpret_cast(&using_ctc_) - - reinterpret_cast(&graph_)) + sizeof(using_ctc_)); -} - -TFNetworkModel::~TFNetworkModel() { - // @@protoc_insertion_point(destructor:tesseract.TFNetworkModel) - if (GetArenaForAllocation() != nullptr) return; - SharedDtor(); - 
_internal_metadata_.Delete<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); -} - -inline void TFNetworkModel::SharedDtor() { - GOOGLE_DCHECK(GetArenaForAllocation() == nullptr); - spec_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - image_input_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - image_widths_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - image_heights_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - output_layer_.DestroyNoArena(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); - if (this != internal_default_instance()) delete graph_; -} - -void TFNetworkModel::ArenaDtor(void* object) { - TFNetworkModel* _this = reinterpret_cast< TFNetworkModel* >(object); - (void)_this; -} -void TFNetworkModel::RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena*) { -} -void TFNetworkModel::SetCachedSize(int size) const { - _cached_size_.Set(size); -} - -void TFNetworkModel::Clear() { -// @@protoc_insertion_point(message_clear_start:tesseract.TFNetworkModel) - ::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0; - // Prevent compiler warnings about cached_has_bits being unused - (void) cached_has_bits; - - spec_.ClearToEmpty(); - image_input_.ClearToEmpty(); - image_widths_.ClearToEmpty(); - image_heights_.ClearToEmpty(); - output_layer_.ClearToEmpty(); - if (GetArenaForAllocation() == nullptr && graph_ != nullptr) { - delete graph_; - } - graph_ = nullptr; - ::memset(&global_step_, 0, static_cast( - reinterpret_cast(&using_ctc_) - - reinterpret_cast(&global_step_)) + sizeof(using_ctc_)); - _internal_metadata_.Clear<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(); -} - -const char* TFNetworkModel::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) { -#define CHK_(x) if (PROTOBUF_PREDICT_FALSE(!(x))) goto failure - while (!ctx->Done(&ptr)) { - ::PROTOBUF_NAMESPACE_ID::uint32 tag; 
- ptr = ::PROTOBUF_NAMESPACE_ID::internal::ReadTag(ptr, &tag); - switch (tag >> 3) { - // .opencv_tensorflow.GraphDef graph = 1; - case 1: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 10)) { - ptr = ctx->ParseMessage(_internal_mutable_graph(), ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // int64 global_step = 2; - case 2: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 16)) { - global_step_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint64(&ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // string spec = 3; - case 3: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 26)) { - auto str = _internal_mutable_spec(); - ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx); - CHK_(::PROTOBUF_NAMESPACE_ID::internal::VerifyUTF8(str, "tesseract.TFNetworkModel.spec")); - CHK_(ptr); - } else goto handle_unusual; - continue; - // int32 depth = 4; - case 4: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 32)) { - depth_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint64(&ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // int32 x_size = 5; - case 5: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 40)) { - x_size_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint64(&ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // int32 y_size = 6; - case 6: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 48)) { - y_size_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint64(&ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // int32 batch_size = 8; - case 8: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 64)) { - batch_size_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint64(&ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // int32 num_classes = 9; - case 9: - if 
(PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 72)) { - num_classes_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint64(&ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // bool using_ctc = 10; - case 10: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 80)) { - using_ctc_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint64(&ptr); - CHK_(ptr); - } else goto handle_unusual; - continue; - // string image_input = 11; - case 11: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 90)) { - auto str = _internal_mutable_image_input(); - ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx); - CHK_(::PROTOBUF_NAMESPACE_ID::internal::VerifyUTF8(str, "tesseract.TFNetworkModel.image_input")); - CHK_(ptr); - } else goto handle_unusual; - continue; - // string image_widths = 12; - case 12: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 98)) { - auto str = _internal_mutable_image_widths(); - ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx); - CHK_(::PROTOBUF_NAMESPACE_ID::internal::VerifyUTF8(str, "tesseract.TFNetworkModel.image_widths")); - CHK_(ptr); - } else goto handle_unusual; - continue; - // string image_heights = 13; - case 13: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 106)) { - auto str = _internal_mutable_image_heights(); - ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx); - CHK_(::PROTOBUF_NAMESPACE_ID::internal::VerifyUTF8(str, "tesseract.TFNetworkModel.image_heights")); - CHK_(ptr); - } else goto handle_unusual; - continue; - // string output_layer = 14; - case 14: - if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 114)) { - auto str = _internal_mutable_output_layer(); - ptr = ::PROTOBUF_NAMESPACE_ID::internal::InlineGreedyStringParser(str, ptr, ctx); - 
CHK_(::PROTOBUF_NAMESPACE_ID::internal::VerifyUTF8(str, "tesseract.TFNetworkModel.output_layer")); - CHK_(ptr); - } else goto handle_unusual; - continue; - default: { - handle_unusual: - if ((tag == 0) || ((tag & 7) == 4)) { - CHK_(ptr); - ctx->SetLastTag(tag); - goto success; - } - ptr = UnknownFieldParse(tag, - _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(), - ptr, ctx); - CHK_(ptr != nullptr); - continue; - } - } // switch - } // while -success: - return ptr; -failure: - ptr = nullptr; - goto success; -#undef CHK_ -} - -::PROTOBUF_NAMESPACE_ID::uint8* TFNetworkModel::_InternalSerialize( - ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const { - // @@protoc_insertion_point(serialize_to_array_start:tesseract.TFNetworkModel) - ::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0; - (void) cached_has_bits; - - // .opencv_tensorflow.GraphDef graph = 1; - if (this->_internal_has_graph()) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite:: - InternalWriteMessage( - 1, _Internal::graph(this), target, stream); - } - - // int64 global_step = 2; - if (this->_internal_global_step() != 0) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt64ToArray(2, this->_internal_global_step(), target); - } - - // string spec = 3; - if (!this->_internal_spec().empty()) { - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::VerifyUtf8String( - this->_internal_spec().data(), static_cast(this->_internal_spec().length()), - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::SERIALIZE, - "tesseract.TFNetworkModel.spec"); - target = stream->WriteStringMaybeAliased( - 3, this->_internal_spec(), target); - } - - // int32 depth = 4; - if (this->_internal_depth() != 0) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(4, 
this->_internal_depth(), target); - } - - // int32 x_size = 5; - if (this->_internal_x_size() != 0) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(5, this->_internal_x_size(), target); - } - - // int32 y_size = 6; - if (this->_internal_y_size() != 0) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(6, this->_internal_y_size(), target); - } - - // int32 batch_size = 8; - if (this->_internal_batch_size() != 0) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(8, this->_internal_batch_size(), target); - } - - // int32 num_classes = 9; - if (this->_internal_num_classes() != 0) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt32ToArray(9, this->_internal_num_classes(), target); - } - - // bool using_ctc = 10; - if (this->_internal_using_ctc() != 0) { - target = stream->EnsureSpace(target); - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteBoolToArray(10, this->_internal_using_ctc(), target); - } - - // string image_input = 11; - if (!this->_internal_image_input().empty()) { - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::VerifyUtf8String( - this->_internal_image_input().data(), static_cast(this->_internal_image_input().length()), - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::SERIALIZE, - "tesseract.TFNetworkModel.image_input"); - target = stream->WriteStringMaybeAliased( - 11, this->_internal_image_input(), target); - } - - // string image_widths = 12; - if (!this->_internal_image_widths().empty()) { - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::VerifyUtf8String( - this->_internal_image_widths().data(), static_cast(this->_internal_image_widths().length()), - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::SERIALIZE, - 
"tesseract.TFNetworkModel.image_widths"); - target = stream->WriteStringMaybeAliased( - 12, this->_internal_image_widths(), target); - } - - // string image_heights = 13; - if (!this->_internal_image_heights().empty()) { - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::VerifyUtf8String( - this->_internal_image_heights().data(), static_cast(this->_internal_image_heights().length()), - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::SERIALIZE, - "tesseract.TFNetworkModel.image_heights"); - target = stream->WriteStringMaybeAliased( - 13, this->_internal_image_heights(), target); - } - - // string output_layer = 14; - if (!this->_internal_output_layer().empty()) { - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::VerifyUtf8String( - this->_internal_output_layer().data(), static_cast(this->_internal_output_layer().length()), - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::SERIALIZE, - "tesseract.TFNetworkModel.output_layer"); - target = stream->WriteStringMaybeAliased( - 14, this->_internal_output_layer(), target); - } - - if (PROTOBUF_PREDICT_FALSE(_internal_metadata_.have_unknown_fields())) { - target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormat::InternalSerializeUnknownFieldsToArray( - _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance), target, stream); - } - // @@protoc_insertion_point(serialize_to_array_end:tesseract.TFNetworkModel) - return target; -} - -size_t TFNetworkModel::ByteSizeLong() const { -// @@protoc_insertion_point(message_byte_size_start:tesseract.TFNetworkModel) - size_t total_size = 0; - - ::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0; - // Prevent compiler warnings about cached_has_bits being unused - (void) cached_has_bits; - - // string spec = 3; - if (!this->_internal_spec().empty()) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize( - this->_internal_spec()); - } - - // string image_input = 11; - if 
(!this->_internal_image_input().empty()) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize( - this->_internal_image_input()); - } - - // string image_widths = 12; - if (!this->_internal_image_widths().empty()) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize( - this->_internal_image_widths()); - } - - // string image_heights = 13; - if (!this->_internal_image_heights().empty()) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize( - this->_internal_image_heights()); - } - - // string output_layer = 14; - if (!this->_internal_output_layer().empty()) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::StringSize( - this->_internal_output_layer()); - } - - // .opencv_tensorflow.GraphDef graph = 1; - if (this->_internal_has_graph()) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::MessageSize( - *graph_); - } - - // int64 global_step = 2; - if (this->_internal_global_step() != 0) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int64Size( - this->_internal_global_step()); - } - - // int32 depth = 4; - if (this->_internal_depth() != 0) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( - this->_internal_depth()); - } - - // int32 x_size = 5; - if (this->_internal_x_size() != 0) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( - this->_internal_x_size()); - } - - // int32 y_size = 6; - if (this->_internal_y_size() != 0) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( - this->_internal_y_size()); - } - - // int32 batch_size = 8; - if (this->_internal_batch_size() != 0) { - total_size += 1 + - ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( - this->_internal_batch_size()); - } - - // int32 num_classes = 9; - if (this->_internal_num_classes() != 0) { - total_size += 1 + - 
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int32Size( - this->_internal_num_classes()); - } - - // bool using_ctc = 10; - if (this->_internal_using_ctc() != 0) { - total_size += 1 + 1; - } - - if (PROTOBUF_PREDICT_FALSE(_internal_metadata_.have_unknown_fields())) { - return ::PROTOBUF_NAMESPACE_ID::internal::ComputeUnknownFieldsSize( - _internal_metadata_, total_size, &_cached_size_); - } - int cached_size = ::PROTOBUF_NAMESPACE_ID::internal::ToCachedSize(total_size); - SetCachedSize(cached_size); - return total_size; -} - -const ::PROTOBUF_NAMESPACE_ID::Message::ClassData TFNetworkModel::_class_data_ = { - ::PROTOBUF_NAMESPACE_ID::Message::CopyWithSizeCheck, - TFNetworkModel::MergeImpl -}; -const ::PROTOBUF_NAMESPACE_ID::Message::ClassData*TFNetworkModel::GetClassData() const { return &_class_data_; } - -void TFNetworkModel::MergeImpl(::PROTOBUF_NAMESPACE_ID::Message*to, - const ::PROTOBUF_NAMESPACE_ID::Message&from) { - static_cast(to)->MergeFrom( - static_cast(from)); -} - - -void TFNetworkModel::MergeFrom(const TFNetworkModel& from) { -// @@protoc_insertion_point(class_specific_merge_from_start:tesseract.TFNetworkModel) - GOOGLE_DCHECK_NE(&from, this); - ::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0; - (void) cached_has_bits; - - if (!from._internal_spec().empty()) { - _internal_set_spec(from._internal_spec()); - } - if (!from._internal_image_input().empty()) { - _internal_set_image_input(from._internal_image_input()); - } - if (!from._internal_image_widths().empty()) { - _internal_set_image_widths(from._internal_image_widths()); - } - if (!from._internal_image_heights().empty()) { - _internal_set_image_heights(from._internal_image_heights()); - } - if (!from._internal_output_layer().empty()) { - _internal_set_output_layer(from._internal_output_layer()); - } - if (from._internal_has_graph()) { - _internal_mutable_graph()->::opencv_tensorflow::GraphDef::MergeFrom(from._internal_graph()); - } - if (from._internal_global_step() != 0) { - 
_internal_set_global_step(from._internal_global_step()); - } - if (from._internal_depth() != 0) { - _internal_set_depth(from._internal_depth()); - } - if (from._internal_x_size() != 0) { - _internal_set_x_size(from._internal_x_size()); - } - if (from._internal_y_size() != 0) { - _internal_set_y_size(from._internal_y_size()); - } - if (from._internal_batch_size() != 0) { - _internal_set_batch_size(from._internal_batch_size()); - } - if (from._internal_num_classes() != 0) { - _internal_set_num_classes(from._internal_num_classes()); - } - if (from._internal_using_ctc() != 0) { - _internal_set_using_ctc(from._internal_using_ctc()); - } - _internal_metadata_.MergeFrom<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(from._internal_metadata_); -} - -void TFNetworkModel::CopyFrom(const TFNetworkModel& from) { -// @@protoc_insertion_point(class_specific_copy_from_start:tesseract.TFNetworkModel) - if (&from == this) return; - Clear(); - MergeFrom(from); -} - -bool TFNetworkModel::IsInitialized() const { - return true; -} - -void TFNetworkModel::InternalSwap(TFNetworkModel* other) { - using std::swap; - _internal_metadata_.InternalSwap(&other->_internal_metadata_); - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::InternalSwap( - &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), - &spec_, GetArenaForAllocation(), - &other->spec_, other->GetArenaForAllocation() - ); - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::InternalSwap( - &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), - &image_input_, GetArenaForAllocation(), - &other->image_input_, other->GetArenaForAllocation() - ); - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::InternalSwap( - &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), - &image_widths_, GetArenaForAllocation(), - &other->image_widths_, other->GetArenaForAllocation() - ); - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::InternalSwap( - 
&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), - &image_heights_, GetArenaForAllocation(), - &other->image_heights_, other->GetArenaForAllocation() - ); - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::InternalSwap( - &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), - &output_layer_, GetArenaForAllocation(), - &other->output_layer_, other->GetArenaForAllocation() - ); - ::PROTOBUF_NAMESPACE_ID::internal::memswap< - PROTOBUF_FIELD_OFFSET(TFNetworkModel, using_ctc_) - + sizeof(TFNetworkModel::using_ctc_) - - PROTOBUF_FIELD_OFFSET(TFNetworkModel, graph_)>( - reinterpret_cast(&graph_), - reinterpret_cast(&other->graph_)); -} - -::PROTOBUF_NAMESPACE_ID::Metadata TFNetworkModel::GetMetadata() const { - return ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors( - &descriptor_table_tfnetwork_2eproto_getter, &descriptor_table_tfnetwork_2eproto_once, - file_level_metadata_tfnetwork_2eproto[0]); -} - -// @@protoc_insertion_point(namespace_scope) -} // namespace tesseract -PROTOBUF_NAMESPACE_OPEN -template<> PROTOBUF_NOINLINE ::tesseract::TFNetworkModel* Arena::CreateMaybeMessage< ::tesseract::TFNetworkModel >(Arena* arena) { - return Arena::CreateMessageInternal< ::tesseract::TFNetworkModel >(arena); -} -PROTOBUF_NAMESPACE_CLOSE - -// @@protoc_insertion_point(global_scope) -#include diff --git a/src/lstm/tfnetwork.pb.h b/src/lstm/tfnetwork.pb.h deleted file mode 100644 index c4bd52afc1..0000000000 --- a/src/lstm/tfnetwork.pb.h +++ /dev/null @@ -1,850 +0,0 @@ -// Generated by the protocol buffer compiler. DO NOT EDIT! -// source: tfnetwork.proto - -#ifndef GOOGLE_PROTOBUF_INCLUDED_tfnetwork_2eproto -#define GOOGLE_PROTOBUF_INCLUDED_tfnetwork_2eproto - -#include -#include - -#include -#if PROTOBUF_VERSION < 3017000 -#error This file was generated by a newer version of protoc which is -#error incompatible with your Protocol Buffer headers. Please update -#error your headers. 
-#endif -#if 3017003 < PROTOBUF_MIN_PROTOC_VERSION -#error This file was generated by an older version of protoc which is -#error incompatible with your Protocol Buffer headers. Please -#error regenerate this file with a newer version of protoc. -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // IWYU pragma: export -#include // IWYU pragma: export -#include -#include "tensorflow/core/framework/graph.pb.h" -// @@protoc_insertion_point(includes) -#include -#define PROTOBUF_INTERNAL_EXPORT_tfnetwork_2eproto -PROTOBUF_NAMESPACE_OPEN -namespace internal { -class AnyMetadata; -} // namespace internal -PROTOBUF_NAMESPACE_CLOSE - -// Internal implementation detail -- do not use these members. -struct TableStruct_tfnetwork_2eproto { - static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] - PROTOBUF_SECTION_VARIABLE(protodesc_cold); - static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] - PROTOBUF_SECTION_VARIABLE(protodesc_cold); - static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1] - PROTOBUF_SECTION_VARIABLE(protodesc_cold); - static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; - static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; - static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; -}; -extern const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_tfnetwork_2eproto; -namespace tesseract { -class TFNetworkModel; -struct TFNetworkModelDefaultTypeInternal; -extern TFNetworkModelDefaultTypeInternal _TFNetworkModel_default_instance_; -} // namespace tesseract -PROTOBUF_NAMESPACE_OPEN -template<> ::tesseract::TFNetworkModel* Arena::CreateMaybeMessage<::tesseract::TFNetworkModel>(Arena*); -PROTOBUF_NAMESPACE_CLOSE -namespace tesseract { - -// =================================================================== - -class TFNetworkModel final : - public 
::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:tesseract.TFNetworkModel) */ { - public: - inline TFNetworkModel() : TFNetworkModel(nullptr) {} - ~TFNetworkModel() override; - explicit constexpr TFNetworkModel(::PROTOBUF_NAMESPACE_ID::internal::ConstantInitialized); - - TFNetworkModel(const TFNetworkModel& from); - TFNetworkModel(TFNetworkModel&& from) noexcept - : TFNetworkModel() { - *this = ::std::move(from); - } - - inline TFNetworkModel& operator=(const TFNetworkModel& from) { - CopyFrom(from); - return *this; - } - inline TFNetworkModel& operator=(TFNetworkModel&& from) noexcept { - if (this == &from) return *this; - if (GetOwningArena() == from.GetOwningArena()) { - InternalSwap(&from); - } else { - CopyFrom(from); - } - return *this; - } - - static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { - return GetDescriptor(); - } - static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { - return default_instance().GetMetadata().descriptor; - } - static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { - return default_instance().GetMetadata().reflection; - } - static const TFNetworkModel& default_instance() { - return *internal_default_instance(); - } - static inline const TFNetworkModel* internal_default_instance() { - return reinterpret_cast( - &_TFNetworkModel_default_instance_); - } - static constexpr int kIndexInFileMessages = - 0; - - friend void swap(TFNetworkModel& a, TFNetworkModel& b) { - a.Swap(&b); - } - inline void Swap(TFNetworkModel* other) { - if (other == this) return; - if (GetOwningArena() == other->GetOwningArena()) { - InternalSwap(other); - } else { - ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); - } - } - void UnsafeArenaSwap(TFNetworkModel* other) { - if (other == this) return; - GOOGLE_DCHECK(GetOwningArena() == other->GetOwningArena()); - InternalSwap(other); - } - - // implements Message ---------------------------------------------- - - inline 
TFNetworkModel* New() const final { - return new TFNetworkModel(); - } - - TFNetworkModel* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { - return CreateMaybeMessage(arena); - } - using ::PROTOBUF_NAMESPACE_ID::Message::CopyFrom; - void CopyFrom(const TFNetworkModel& from); - using ::PROTOBUF_NAMESPACE_ID::Message::MergeFrom; - void MergeFrom(const TFNetworkModel& from); - private: - static void MergeImpl(::PROTOBUF_NAMESPACE_ID::Message*to, const ::PROTOBUF_NAMESPACE_ID::Message&from); - public: - PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; - bool IsInitialized() const final; - - size_t ByteSizeLong() const final; - const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; - ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( - ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; - int GetCachedSize() const final { return _cached_size_.Get(); } - - private: - void SharedCtor(); - void SharedDtor(); - void SetCachedSize(int size) const final; - void InternalSwap(TFNetworkModel* other); - friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; - static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { - return "tesseract.TFNetworkModel"; - } - protected: - explicit TFNetworkModel(::PROTOBUF_NAMESPACE_ID::Arena* arena, - bool is_message_owned = false); - private: - static void ArenaDtor(void* object); - inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); - public: - - static const ClassData _class_data_; - const ::PROTOBUF_NAMESPACE_ID::Message::ClassData*GetClassData() const final; - - ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; - - // nested types ---------------------------------------------------- - - // accessors ------------------------------------------------------- - - enum : int { - kSpecFieldNumber = 3, - kImageInputFieldNumber = 11, - kImageWidthsFieldNumber = 12, - 
kImageHeightsFieldNumber = 13, - kOutputLayerFieldNumber = 14, - kGraphFieldNumber = 1, - kGlobalStepFieldNumber = 2, - kDepthFieldNumber = 4, - kXSizeFieldNumber = 5, - kYSizeFieldNumber = 6, - kBatchSizeFieldNumber = 8, - kNumClassesFieldNumber = 9, - kUsingCtcFieldNumber = 10, - }; - // string spec = 3; - void clear_spec(); - const std::string& spec() const; - template - void set_spec(ArgT0&& arg0, ArgT... args); - std::string* mutable_spec(); - PROTOBUF_MUST_USE_RESULT std::string* release_spec(); - void set_allocated_spec(std::string* spec); - private: - const std::string& _internal_spec() const; - inline PROTOBUF_ALWAYS_INLINE void _internal_set_spec(const std::string& value); - std::string* _internal_mutable_spec(); - public: - - // string image_input = 11; - void clear_image_input(); - const std::string& image_input() const; - template - void set_image_input(ArgT0&& arg0, ArgT... args); - std::string* mutable_image_input(); - PROTOBUF_MUST_USE_RESULT std::string* release_image_input(); - void set_allocated_image_input(std::string* image_input); - private: - const std::string& _internal_image_input() const; - inline PROTOBUF_ALWAYS_INLINE void _internal_set_image_input(const std::string& value); - std::string* _internal_mutable_image_input(); - public: - - // string image_widths = 12; - void clear_image_widths(); - const std::string& image_widths() const; - template - void set_image_widths(ArgT0&& arg0, ArgT... args); - std::string* mutable_image_widths(); - PROTOBUF_MUST_USE_RESULT std::string* release_image_widths(); - void set_allocated_image_widths(std::string* image_widths); - private: - const std::string& _internal_image_widths() const; - inline PROTOBUF_ALWAYS_INLINE void _internal_set_image_widths(const std::string& value); - std::string* _internal_mutable_image_widths(); - public: - - // string image_heights = 13; - void clear_image_heights(); - const std::string& image_heights() const; - template - void set_image_heights(ArgT0&& arg0, ArgT... 
args); - std::string* mutable_image_heights(); - PROTOBUF_MUST_USE_RESULT std::string* release_image_heights(); - void set_allocated_image_heights(std::string* image_heights); - private: - const std::string& _internal_image_heights() const; - inline PROTOBUF_ALWAYS_INLINE void _internal_set_image_heights(const std::string& value); - std::string* _internal_mutable_image_heights(); - public: - - // string output_layer = 14; - void clear_output_layer(); - const std::string& output_layer() const; - template - void set_output_layer(ArgT0&& arg0, ArgT... args); - std::string* mutable_output_layer(); - PROTOBUF_MUST_USE_RESULT std::string* release_output_layer(); - void set_allocated_output_layer(std::string* output_layer); - private: - const std::string& _internal_output_layer() const; - inline PROTOBUF_ALWAYS_INLINE void _internal_set_output_layer(const std::string& value); - std::string* _internal_mutable_output_layer(); - public: - - // .opencv_tensorflow.GraphDef graph = 1; - bool has_graph() const; - private: - bool _internal_has_graph() const; - public: - void clear_graph(); - const ::opencv_tensorflow::GraphDef& graph() const; - PROTOBUF_MUST_USE_RESULT ::opencv_tensorflow::GraphDef* release_graph(); - ::opencv_tensorflow::GraphDef* mutable_graph(); - void set_allocated_graph(::opencv_tensorflow::GraphDef* graph); - private: - const ::opencv_tensorflow::GraphDef& _internal_graph() const; - ::opencv_tensorflow::GraphDef* _internal_mutable_graph(); - public: - void unsafe_arena_set_allocated_graph( - ::opencv_tensorflow::GraphDef* graph); - ::opencv_tensorflow::GraphDef* unsafe_arena_release_graph(); - - // int64 global_step = 2; - void clear_global_step(); - ::PROTOBUF_NAMESPACE_ID::int64 global_step() const; - void set_global_step(::PROTOBUF_NAMESPACE_ID::int64 value); - private: - ::PROTOBUF_NAMESPACE_ID::int64 _internal_global_step() const; - void _internal_set_global_step(::PROTOBUF_NAMESPACE_ID::int64 value); - public: - - // int32 depth = 4; - void 
clear_depth(); - ::PROTOBUF_NAMESPACE_ID::int32 depth() const; - void set_depth(::PROTOBUF_NAMESPACE_ID::int32 value); - private: - ::PROTOBUF_NAMESPACE_ID::int32 _internal_depth() const; - void _internal_set_depth(::PROTOBUF_NAMESPACE_ID::int32 value); - public: - - // int32 x_size = 5; - void clear_x_size(); - ::PROTOBUF_NAMESPACE_ID::int32 x_size() const; - void set_x_size(::PROTOBUF_NAMESPACE_ID::int32 value); - private: - ::PROTOBUF_NAMESPACE_ID::int32 _internal_x_size() const; - void _internal_set_x_size(::PROTOBUF_NAMESPACE_ID::int32 value); - public: - - // int32 y_size = 6; - void clear_y_size(); - ::PROTOBUF_NAMESPACE_ID::int32 y_size() const; - void set_y_size(::PROTOBUF_NAMESPACE_ID::int32 value); - private: - ::PROTOBUF_NAMESPACE_ID::int32 _internal_y_size() const; - void _internal_set_y_size(::PROTOBUF_NAMESPACE_ID::int32 value); - public: - - // int32 batch_size = 8; - void clear_batch_size(); - ::PROTOBUF_NAMESPACE_ID::int32 batch_size() const; - void set_batch_size(::PROTOBUF_NAMESPACE_ID::int32 value); - private: - ::PROTOBUF_NAMESPACE_ID::int32 _internal_batch_size() const; - void _internal_set_batch_size(::PROTOBUF_NAMESPACE_ID::int32 value); - public: - - // int32 num_classes = 9; - void clear_num_classes(); - ::PROTOBUF_NAMESPACE_ID::int32 num_classes() const; - void set_num_classes(::PROTOBUF_NAMESPACE_ID::int32 value); - private: - ::PROTOBUF_NAMESPACE_ID::int32 _internal_num_classes() const; - void _internal_set_num_classes(::PROTOBUF_NAMESPACE_ID::int32 value); - public: - - // bool using_ctc = 10; - void clear_using_ctc(); - bool using_ctc() const; - void set_using_ctc(bool value); - private: - bool _internal_using_ctc() const; - void _internal_set_using_ctc(bool value); - public: - - // @@protoc_insertion_point(class_scope:tesseract.TFNetworkModel) - private: - class _Internal; - - template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; - typedef void InternalArenaConstructable_; - typedef void DestructorSkippable_; - 
::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr spec_; - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr image_input_; - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr image_widths_; - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr image_heights_; - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr output_layer_; - ::opencv_tensorflow::GraphDef* graph_; - ::PROTOBUF_NAMESPACE_ID::int64 global_step_; - ::PROTOBUF_NAMESPACE_ID::int32 depth_; - ::PROTOBUF_NAMESPACE_ID::int32 x_size_; - ::PROTOBUF_NAMESPACE_ID::int32 y_size_; - ::PROTOBUF_NAMESPACE_ID::int32 batch_size_; - ::PROTOBUF_NAMESPACE_ID::int32 num_classes_; - bool using_ctc_; - mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; - friend struct ::TableStruct_tfnetwork_2eproto; -}; -// =================================================================== - - -// =================================================================== - -#ifdef __GNUC__ - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wstrict-aliasing" -#endif // __GNUC__ -// TFNetworkModel - -// .opencv_tensorflow.GraphDef graph = 1; -inline bool TFNetworkModel::_internal_has_graph() const { - return this != internal_default_instance() && graph_ != nullptr; -} -inline bool TFNetworkModel::has_graph() const { - return _internal_has_graph(); -} -inline const ::opencv_tensorflow::GraphDef& TFNetworkModel::_internal_graph() const { - const ::opencv_tensorflow::GraphDef* p = graph_; - return p != nullptr ? 
*p : reinterpret_cast( - ::opencv_tensorflow::_GraphDef_default_instance_); -} -inline const ::opencv_tensorflow::GraphDef& TFNetworkModel::graph() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.graph) - return _internal_graph(); -} -inline void TFNetworkModel::unsafe_arena_set_allocated_graph( - ::opencv_tensorflow::GraphDef* graph) { - if (GetArenaForAllocation() == nullptr) { - delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(graph_); - } - graph_ = graph; - if (graph) { - - } else { - - } - // @@protoc_insertion_point(field_unsafe_arena_set_allocated:tesseract.TFNetworkModel.graph) -} -inline ::opencv_tensorflow::GraphDef* TFNetworkModel::release_graph() { - - ::opencv_tensorflow::GraphDef* temp = graph_; - graph_ = nullptr; -#ifdef PROTOBUF_FORCE_COPY_IN_RELEASE - auto* old = reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(temp); - temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); - if (GetArenaForAllocation() == nullptr) { delete old; } -#else // PROTOBUF_FORCE_COPY_IN_RELEASE - if (GetArenaForAllocation() != nullptr) { - temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); - } -#endif // !PROTOBUF_FORCE_COPY_IN_RELEASE - return temp; -} -inline ::opencv_tensorflow::GraphDef* TFNetworkModel::unsafe_arena_release_graph() { - // @@protoc_insertion_point(field_release:tesseract.TFNetworkModel.graph) - - ::opencv_tensorflow::GraphDef* temp = graph_; - graph_ = nullptr; - return temp; -} -inline ::opencv_tensorflow::GraphDef* TFNetworkModel::_internal_mutable_graph() { - - if (graph_ == nullptr) { - auto* p = CreateMaybeMessage<::opencv_tensorflow::GraphDef>(GetArenaForAllocation()); - graph_ = p; - } - return graph_; -} -inline ::opencv_tensorflow::GraphDef* TFNetworkModel::mutable_graph() { - ::opencv_tensorflow::GraphDef* _msg = _internal_mutable_graph(); - // @@protoc_insertion_point(field_mutable:tesseract.TFNetworkModel.graph) - return _msg; -} -inline void 
TFNetworkModel::set_allocated_graph(::opencv_tensorflow::GraphDef* graph) { - ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArenaForAllocation(); - if (message_arena == nullptr) { - delete reinterpret_cast< ::PROTOBUF_NAMESPACE_ID::MessageLite*>(graph_); - } - if (graph) { - ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena = - ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper< - ::PROTOBUF_NAMESPACE_ID::MessageLite>::GetOwningArena( - reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(graph)); - if (message_arena != submessage_arena) { - graph = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage( - message_arena, graph, submessage_arena); - } - - } else { - - } - graph_ = graph; - // @@protoc_insertion_point(field_set_allocated:tesseract.TFNetworkModel.graph) -} - -// int64 global_step = 2; -inline void TFNetworkModel::clear_global_step() { - global_step_ = int64_t{0}; -} -inline ::PROTOBUF_NAMESPACE_ID::int64 TFNetworkModel::_internal_global_step() const { - return global_step_; -} -inline ::PROTOBUF_NAMESPACE_ID::int64 TFNetworkModel::global_step() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.global_step) - return _internal_global_step(); -} -inline void TFNetworkModel::_internal_set_global_step(::PROTOBUF_NAMESPACE_ID::int64 value) { - - global_step_ = value; -} -inline void TFNetworkModel::set_global_step(::PROTOBUF_NAMESPACE_ID::int64 value) { - _internal_set_global_step(value); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.global_step) -} - -// string spec = 3; -inline void TFNetworkModel::clear_spec() { - spec_.ClearToEmpty(); -} -inline const std::string& TFNetworkModel::spec() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.spec) - return _internal_spec(); -} -template -inline PROTOBUF_ALWAYS_INLINE -void TFNetworkModel::set_spec(ArgT0&& arg0, ArgT... 
args) { - - spec_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, static_cast(arg0), args..., GetArenaForAllocation()); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.spec) -} -inline std::string* TFNetworkModel::mutable_spec() { - std::string* _s = _internal_mutable_spec(); - // @@protoc_insertion_point(field_mutable:tesseract.TFNetworkModel.spec) - return _s; -} -inline const std::string& TFNetworkModel::_internal_spec() const { - return spec_.Get(); -} -inline void TFNetworkModel::_internal_set_spec(const std::string& value) { - - spec_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::_internal_mutable_spec() { - - return spec_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::release_spec() { - // @@protoc_insertion_point(field_release:tesseract.TFNetworkModel.spec) - return spec_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArenaForAllocation()); -} -inline void TFNetworkModel::set_allocated_spec(std::string* spec) { - if (spec != nullptr) { - - } else { - - } - spec_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), spec, - GetArenaForAllocation()); - // @@protoc_insertion_point(field_set_allocated:tesseract.TFNetworkModel.spec) -} - -// int32 depth = 4; -inline void TFNetworkModel::clear_depth() { - depth_ = 0; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::_internal_depth() const { - return depth_; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::depth() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.depth) - return _internal_depth(); -} -inline void TFNetworkModel::_internal_set_depth(::PROTOBUF_NAMESPACE_ID::int32 value) { - - depth_ = value; -} -inline void TFNetworkModel::set_depth(::PROTOBUF_NAMESPACE_ID::int32 value) { - 
_internal_set_depth(value); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.depth) -} - -// int32 x_size = 5; -inline void TFNetworkModel::clear_x_size() { - x_size_ = 0; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::_internal_x_size() const { - return x_size_; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::x_size() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.x_size) - return _internal_x_size(); -} -inline void TFNetworkModel::_internal_set_x_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - - x_size_ = value; -} -inline void TFNetworkModel::set_x_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _internal_set_x_size(value); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.x_size) -} - -// int32 y_size = 6; -inline void TFNetworkModel::clear_y_size() { - y_size_ = 0; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::_internal_y_size() const { - return y_size_; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::y_size() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.y_size) - return _internal_y_size(); -} -inline void TFNetworkModel::_internal_set_y_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - - y_size_ = value; -} -inline void TFNetworkModel::set_y_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _internal_set_y_size(value); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.y_size) -} - -// int32 batch_size = 8; -inline void TFNetworkModel::clear_batch_size() { - batch_size_ = 0; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::_internal_batch_size() const { - return batch_size_; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::batch_size() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.batch_size) - return _internal_batch_size(); -} -inline void TFNetworkModel::_internal_set_batch_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - - batch_size_ = value; -} -inline void 
TFNetworkModel::set_batch_size(::PROTOBUF_NAMESPACE_ID::int32 value) { - _internal_set_batch_size(value); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.batch_size) -} - -// int32 num_classes = 9; -inline void TFNetworkModel::clear_num_classes() { - num_classes_ = 0; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::_internal_num_classes() const { - return num_classes_; -} -inline ::PROTOBUF_NAMESPACE_ID::int32 TFNetworkModel::num_classes() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.num_classes) - return _internal_num_classes(); -} -inline void TFNetworkModel::_internal_set_num_classes(::PROTOBUF_NAMESPACE_ID::int32 value) { - - num_classes_ = value; -} -inline void TFNetworkModel::set_num_classes(::PROTOBUF_NAMESPACE_ID::int32 value) { - _internal_set_num_classes(value); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.num_classes) -} - -// bool using_ctc = 10; -inline void TFNetworkModel::clear_using_ctc() { - using_ctc_ = false; -} -inline bool TFNetworkModel::_internal_using_ctc() const { - return using_ctc_; -} -inline bool TFNetworkModel::using_ctc() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.using_ctc) - return _internal_using_ctc(); -} -inline void TFNetworkModel::_internal_set_using_ctc(bool value) { - - using_ctc_ = value; -} -inline void TFNetworkModel::set_using_ctc(bool value) { - _internal_set_using_ctc(value); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.using_ctc) -} - -// string image_input = 11; -inline void TFNetworkModel::clear_image_input() { - image_input_.ClearToEmpty(); -} -inline const std::string& TFNetworkModel::image_input() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.image_input) - return _internal_image_input(); -} -template -inline PROTOBUF_ALWAYS_INLINE -void TFNetworkModel::set_image_input(ArgT0&& arg0, ArgT... 
args) { - - image_input_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, static_cast(arg0), args..., GetArenaForAllocation()); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.image_input) -} -inline std::string* TFNetworkModel::mutable_image_input() { - std::string* _s = _internal_mutable_image_input(); - // @@protoc_insertion_point(field_mutable:tesseract.TFNetworkModel.image_input) - return _s; -} -inline const std::string& TFNetworkModel::_internal_image_input() const { - return image_input_.Get(); -} -inline void TFNetworkModel::_internal_set_image_input(const std::string& value) { - - image_input_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::_internal_mutable_image_input() { - - return image_input_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::release_image_input() { - // @@protoc_insertion_point(field_release:tesseract.TFNetworkModel.image_input) - return image_input_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArenaForAllocation()); -} -inline void TFNetworkModel::set_allocated_image_input(std::string* image_input) { - if (image_input != nullptr) { - - } else { - - } - image_input_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), image_input, - GetArenaForAllocation()); - // @@protoc_insertion_point(field_set_allocated:tesseract.TFNetworkModel.image_input) -} - -// string image_widths = 12; -inline void TFNetworkModel::clear_image_widths() { - image_widths_.ClearToEmpty(); -} -inline const std::string& TFNetworkModel::image_widths() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.image_widths) - return _internal_image_widths(); -} -template -inline PROTOBUF_ALWAYS_INLINE -void TFNetworkModel::set_image_widths(ArgT0&& arg0, ArgT... 
args) { - - image_widths_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, static_cast(arg0), args..., GetArenaForAllocation()); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.image_widths) -} -inline std::string* TFNetworkModel::mutable_image_widths() { - std::string* _s = _internal_mutable_image_widths(); - // @@protoc_insertion_point(field_mutable:tesseract.TFNetworkModel.image_widths) - return _s; -} -inline const std::string& TFNetworkModel::_internal_image_widths() const { - return image_widths_.Get(); -} -inline void TFNetworkModel::_internal_set_image_widths(const std::string& value) { - - image_widths_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::_internal_mutable_image_widths() { - - return image_widths_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::release_image_widths() { - // @@protoc_insertion_point(field_release:tesseract.TFNetworkModel.image_widths) - return image_widths_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArenaForAllocation()); -} -inline void TFNetworkModel::set_allocated_image_widths(std::string* image_widths) { - if (image_widths != nullptr) { - - } else { - - } - image_widths_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), image_widths, - GetArenaForAllocation()); - // @@protoc_insertion_point(field_set_allocated:tesseract.TFNetworkModel.image_widths) -} - -// string image_heights = 13; -inline void TFNetworkModel::clear_image_heights() { - image_heights_.ClearToEmpty(); -} -inline const std::string& TFNetworkModel::image_heights() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.image_heights) - return _internal_image_heights(); -} -template -inline PROTOBUF_ALWAYS_INLINE -void 
TFNetworkModel::set_image_heights(ArgT0&& arg0, ArgT... args) { - - image_heights_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, static_cast(arg0), args..., GetArenaForAllocation()); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.image_heights) -} -inline std::string* TFNetworkModel::mutable_image_heights() { - std::string* _s = _internal_mutable_image_heights(); - // @@protoc_insertion_point(field_mutable:tesseract.TFNetworkModel.image_heights) - return _s; -} -inline const std::string& TFNetworkModel::_internal_image_heights() const { - return image_heights_.Get(); -} -inline void TFNetworkModel::_internal_set_image_heights(const std::string& value) { - - image_heights_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::_internal_mutable_image_heights() { - - return image_heights_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::release_image_heights() { - // @@protoc_insertion_point(field_release:tesseract.TFNetworkModel.image_heights) - return image_heights_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArenaForAllocation()); -} -inline void TFNetworkModel::set_allocated_image_heights(std::string* image_heights) { - if (image_heights != nullptr) { - - } else { - - } - image_heights_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), image_heights, - GetArenaForAllocation()); - // @@protoc_insertion_point(field_set_allocated:tesseract.TFNetworkModel.image_heights) -} - -// string output_layer = 14; -inline void TFNetworkModel::clear_output_layer() { - output_layer_.ClearToEmpty(); -} -inline const std::string& TFNetworkModel::output_layer() const { - // @@protoc_insertion_point(field_get:tesseract.TFNetworkModel.output_layer) - return _internal_output_layer(); -} -template 
-inline PROTOBUF_ALWAYS_INLINE -void TFNetworkModel::set_output_layer(ArgT0&& arg0, ArgT... args) { - - output_layer_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, static_cast(arg0), args..., GetArenaForAllocation()); - // @@protoc_insertion_point(field_set:tesseract.TFNetworkModel.output_layer) -} -inline std::string* TFNetworkModel::mutable_output_layer() { - std::string* _s = _internal_mutable_output_layer(); - // @@protoc_insertion_point(field_mutable:tesseract.TFNetworkModel.output_layer) - return _s; -} -inline const std::string& TFNetworkModel::_internal_output_layer() const { - return output_layer_.Get(); -} -inline void TFNetworkModel::_internal_set_output_layer(const std::string& value) { - - output_layer_.Set(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, value, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::_internal_mutable_output_layer() { - - return output_layer_.Mutable(::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr::EmptyDefault{}, GetArenaForAllocation()); -} -inline std::string* TFNetworkModel::release_output_layer() { - // @@protoc_insertion_point(field_release:tesseract.TFNetworkModel.output_layer) - return output_layer_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArenaForAllocation()); -} -inline void TFNetworkModel::set_allocated_output_layer(std::string* output_layer) { - if (output_layer != nullptr) { - - } else { - - } - output_layer_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), output_layer, - GetArenaForAllocation()); - // @@protoc_insertion_point(field_set_allocated:tesseract.TFNetworkModel.output_layer) -} - -#ifdef __GNUC__ - #pragma GCC diagnostic pop -#endif // __GNUC__ - -// @@protoc_insertion_point(namespace_scope) - -} // namespace tesseract - -// @@protoc_insertion_point(global_scope) - -#include -#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_tfnetwork_2eproto diff --git 
a/src/lstm/tfnetwork.proto b/src/lstm/tfnetwork.proto deleted file mode 100644 index fb505bfccd..0000000000 --- a/src/lstm/tfnetwork.proto +++ /dev/null @@ -1,74 +0,0 @@ -// Protocol description for Tesseract - -// Compile this file with the Protocol Compiler protoc to generate -// the files tfnetwork.pb.cc and tfnetwork.pb.h. - -// This requires the protocol descriptions for TensorFlow -// (included in the TensorFlow sources). - -// With TensorFlow sources installed in /usr/src/tensorflow/tensorflow-1.10.1, -// this command was used on Debian to generate the files: - -// protoc --cpp_out=$PWD --proto_path=/usr/src/tensorflow/tensorflow-1.10.1 \ -// --proto_path=$PWD src/lstm/tfnetwork.proto - -syntax = "proto3"; - -package tesseract; - -import "tensorflow/core/framework/graph.proto"; - -// This proto is the interface between a python TF graph builder/trainer and -// the C++ world. The writer of this proto must provide fields as documented -// by the comments below. -// The graph must have a placeholder for NetworkIO, Widths and Heights. The -// following python code creates the appropriate placeholders: -// -// input_layer = tf.placeholder(tf.float32, -// shape=[batch_size, xsize, ysize, depth_dim], -// name='NetworkIO') -// widths = tf.placeholder(tf.int32, shape=[batch_size], name='Widths') -// heights = tf.placeholder(tf.int32, shape=[batch_size], name='Heights') -// # Flip x and y to the TF convention. -// input_layer = tf.transpose(input_layer, [0, 2, 1, 3]) -// -// The widths and heights will be set to indicate the post-scaling size of the -// input image(s). -// For now batch_size is ignored and set to 1. -// The graph should return a 2-dimensional float32 tensor called 'softmax' of -// shape [sequence_length, num_classes], where sequence_length is allowed to -// be variable, given by the tensor itself. -// TODO(rays) determine whether it is worth providing for batch_size >1 and if -// so, how. -message TFNetworkModel { - // The TF graph definition. 
Required. - opencv_tensorflow.GraphDef graph = 1; - // The training index. Required to be > 0. - int64 global_step = 2; - // The original network definition for reference. Optional - string spec = 3; - // Input tensor parameters. - // Values per pixel. Required to be 1 or 3. Inputs assumed to be float32. - int32 depth = 4; - // Image size. Required. Zero implies flexible sizes, fixed if non-zero. - // If x_size > 0, images will be cropped/padded to the given size, after - // any scaling required by the y_size. - // If y_size > 0, images will be scaled isotropically to the given height. - int32 x_size = 5; - int32 y_size = 6; - // Number of images in a batch. Optional. - int32 batch_size = 8; - // Output tensor parameters. - // Number of output classes. Required to match the depth of the softmax. - int32 num_classes = 9; - // True if this network needs CTC-like decoding, dropping duplicated labels. - // The decoder always drops the null character. - bool using_ctc = 10; - // Name of input image tensor. - string image_input = 11; - // Name of image height and width tensors. - string image_widths = 12; - string image_heights = 13; - // Name of output (softmax) tensor. 
- string output_layer = 14; -} diff --git a/src/tesseract.cpp b/src/tesseract.cpp index e8bc46516c..900833b8df 100644 --- a/src/tesseract.cpp +++ b/src/tesseract.cpp @@ -116,6 +116,9 @@ static void PrintVersionInfo() { #if defined(HAVE_NEON) || defined(__aarch64__) if (tesseract::SIMDDetect::IsNEONAvailable()) tprintInfo(" Found NEON\n"); +#elif defined(HAVE_RVV) + if (tesseract::SIMDDetect::IsRVVAvailable()) + printf(" Found RVV\n"); #else if (tesseract::SIMDDetect::IsAVX512BWAvailable()) { tprintInfo(" Found AVX512BW\n"); @@ -156,44 +159,42 @@ static void PrintVersionInfo() { } static void PrintHelpForPSM() { - const char *msg = - "Page segmentation modes:\n" - " 0 Orientation and script detection (OSD) only.\n" - " 1 Automatic page segmentation with OSD.\n" - " 2 Automatic page segmentation, but no OSD, nor OCR.\n" - " 3 Fully automatic page segmentation, but no OSD. (Default)\n" - " 4 Assume a single column of text of variable sizes.\n" - " 5 Assume a single uniform block of vertically aligned text.\n" - " 6 Assume a single uniform block of text.\n" - " 7 Treat the image as a single text line.\n" - " 8 Treat the image as a single word.\n" - " 9 Treat the image as a single word in a circle.\n" - " 10 Treat the image as a single character.\n" - " 11 Sparse text. Find as much text as possible in no particular order.\n" - " 12 Sparse text with OSD.\n" - " 13 Raw line. Treat the image as a single text line,\n" - " bypassing hacks that are Tesseract-specific.\n" - "\n"; - -#if DISABLED_LEGACY_ENGINE - const char *disabled_osd_msg = "\nNOTE: The OSD modes are currently disabled.\n"; - tprintInfo("{}{}", msg, disabled_osd_msg); -#else - tprintInfo("{}", msg); + tprintInfo( + "Page segmentation modes (PSM):\n" + " 0|osd_only Orientation and script detection (OSD) only.\n" + " 1|auto_osd Automatic page segmentation with OSD.\n" + " 2|auto_only Automatic page segmentation, but no OSD, nor OCR. 
(not " + "implemented)\n" + " 3|auto Fully automatic page segmentation, but no OSD. (Default)\n" + " 4|single_column Assume a single column of text of variable sizes.\n" + " 5|single_block_vert_text Assume a single uniform block of vertically aligned text.\n" + " 6|single_block Assume a single uniform block of text.\n" + " 7|single_line Treat the image as a single text line.\n" + " 8|single_word Treat the image as a single word.\n" + " 9|circle_word Treat the image as a single word in a circle.\n" + " 10|single_char Treat the image as a single character.\n" + " 11|sparse_text Sparse text. Find as much text as possible in no" + " particular order.\n" + " 12|sparse_text_osd Sparse text with OSD.\n" + " 13|raw_line Raw line. Treat the image as a single text line,\n" + " bypassing hacks that are Tesseract-specific.\n" + "\n"); + +#ifdef DISABLED_LEGACY_ENGINE + tprintInfo("\nNOTE: The OSD modes are currently disabled.\n"); #endif } #if !DISABLED_LEGACY_ENGINE static void PrintHelpForOEM() { - const char *msg = - "OCR Engine modes:\n" - " 0 Legacy engine only.\n" - " 1 Neural nets LSTM engine only.\n" - " 2 Legacy + LSTM engines.\n" - " 3 Default, based on what is available.\n" - "\n"; - - tprintInfo("{}", msg); + tprintInfo( + "OCR Engine modes (OEM):\n" + " 0|tesseract_only Legacy engine only.\n" + " 1|lstm_only Neural nets LSTM engine only.\n" + " 2|tesseract_lstm_combined Legacy + LSTM engines.\n" + " 3|default Default, based on what is available.\n" + "\n"); + ); } #endif // !DISABLED_LEGACY_ENGINE @@ -231,9 +232,9 @@ static void PrintHelpExtra(const char *program) { " -l LANG[+LANG] Specify language(s) used for OCR.\n" " -c VAR=VALUE Set value for config variables.\n" " Multiple -c arguments are allowed.\n" - " --psm NUM Specify page segmentation mode.\n" + " --psm PSM|NUM Specify page segmentation mode.\n" #if !DISABLED_LEGACY_ENGINE - " --oem NUM Specify OCR Engine mode.\n" + " --oem OEM|NUM Specify OCR Engine mode.\n" #endif " --visible-pdf-image PATH\n" " 
Specify path to source page image which will be\n" @@ -413,6 +414,57 @@ static void FixPageSegMode(tesseract::TessBaseAPI &api, tesseract::PageSegMode p } } +// Convert a symbolic or numeric string to an OEM value. +static int stringToOEM(const std::string arg) { + std::map oem_map = { + {"0", 0}, + {"1", 1}, + {"2", 2}, + {"3", 3}, + {"tesseract_only", 0}, + {"lstm_only", 1}, + {"tesseract_lstm_combined", 2}, + {"default", 3}, + }; + auto it = oem_map.find(arg); + return it == oem_map.end() ? -1 : it->second; +} + +static int stringToPSM(const std::string arg) { + std::map psm_map = { + {"0", 0}, + {"1", 1}, + {"2", 2}, + {"3", 3}, + {"4", 4}, + {"5", 5}, + {"6", 6}, + {"7", 7}, + {"8", 8}, + {"9", 9}, + {"10", 10}, + {"11", 11}, + {"12", 12}, + {"13", 13}, + {"osd_only", 0}, + {"auto_osd", 1}, + {"auto_only", 2}, + {"auto", 3}, + {"single_column", 4}, + {"single_block_vert_text", 5}, + {"single_block", 6}, + {"single_line", 7}, + {"single_word", 8}, + {"circle_word", 9}, + {"single_char", 10}, + {"sparse_text", 11}, + {"sparse_text_osd", 12}, + {"raw_line", 13}, + }; + auto it = psm_map.find(arg); + return it == psm_map.end() ? 
-1 : it->second; +} + static void InfoTraineddata(const std::vector &filenames) { for (const std::string &filename : filenames) { tesseract::TessdataManager mgr; @@ -627,12 +679,30 @@ static int ParseArgs(int argc, const char** argv, state |= PARSED_CONFIG_FILESET; continue; } else if (strcmp(argv[i], "--psm") == 0) { + if (i + 1 >= argc) { + tprintError("Command line option '{}' is given without any value to assign.\n", argv[i]); + return false; + } + int psm = stringToPSM(argv[i + 1]); + if (!checkArgValues(psm, "PSM", tesseract::PSM_COUNT)) { + return false; + } vars_vec->push_back("page_segmenting_mode"); // [i_a] NEW :: tessedit_pageseg_mode - PUSH_VALUE_OR_YAK(); + vars_values->push_back(value); + ++i; continue; } else if (strcmp(argv[i], "--oem") == 0) { + if (i + 1 >= argc) { + tprintError("Command line option '{}' is given without any value to assign.\n", argv[i]); + return false; + } + int oem = stringToOEM(argv[i + 1]); + if (!checkArgValues(oem, "OEM", tesseract::OEM_COUNT)) { + return false; + } vars_vec->push_back("engine_mode"); // [i_a] NEW :: tessedit_ocr_engine_mode - PUSH_VALUE_OR_YAK(); + vars_values->push_back(value); + ++i; continue; } else if (strcmp(verb, "--print-parameters") == 0) { cmd |= PRINT_PARAMETERS; @@ -648,17 +718,18 @@ static int ParseArgs(int argc, const char** argv, tprintError("Command line option '-c' is given without a parameter=value assignment following.\n"); return false; } - const char *var_stmt = argv[i + 1]; - ++i; - - const char *p = strchr(var_stmt, '='); - if (!p) { - tprintError("Missing '=' in '-c' configvar assignment statement: '{}'\n", var_stmt); + const std::string argument(argv[i + 1]); + const auto equal_pos = argument.find('='); + if (equal_pos == std::string::npos) { + tprintError("Missing '=' in '-c' configvar assignment statement: '{}'\n", argument); return false; } - std::string name(var_stmt, p - var_stmt); - Param *v = vars_vec->find(name.c_str(), ANY_TYPE_PARAM); - v->set_value(p + 1); + // Extract 
key and value + const std::string key = argument.substr(0, equal_pos); + const std::string value = argument.substr(equal_pos + 1); + vars_vec->push_back(key); + vars_values->push_back(value); + ++i; continue; } else if (strcmp(argv[i], "--source-image") == 0) { vars_vec->push_back("source_image"); // [i_a] NEW @@ -919,11 +990,7 @@ static bool PreloadRenderers(tesseract::TessBaseAPI &api, * **********************************************************************/ -#if defined(TESSERACT_STANDALONE) && !defined(BUILD_MONOLITHIC) -extern "C" int main(int argc, const char** argv) -#else -extern "C" int tesseract_main(int argc, const char **argv) -#endif +static int main1(int argc, const char **argv) { #if defined(__USE_GNU) && defined(HAVE_FEENABLEEXCEPT) // Raise SIGFPE. @@ -1412,3 +1479,18 @@ extern "C" int tesseract_main(int argc, const char **argv) return ret_val; } +#if defined(TESSERACT_STANDALONE) && !defined(BUILD_MONOLITHIC) +extern "C" int main(int argc, const char** argv) +#else +extern "C" int tesseract_main(int argc, const char **argv) +#endif +{ + try { + return main1(argc, argv); + } catch (std::exception &e) { + std::cerr << "exception: " << e.what() << "\n"; + } catch (...) { + std::cerr << "unknown exception\n"; + } + return 1; +} diff --git a/src/textord/baselinedetect.cpp b/src/textord/baselinedetect.cpp index 1f6d575fbf..59889b9558 100644 --- a/src/textord/baselinedetect.cpp +++ b/src/textord/baselinedetect.cpp @@ -30,6 +30,7 @@ #include "helpers.h" #include "linlsq.h" #include "makerow.h" +#include "tesserrstream.h" // for tesserr #include "textord.h" #include #include "underlin.h" diff --git a/src/textord/bbgrid.h b/src/textord/bbgrid.h index e4da28d4d3..6b64caa15b 100644 --- a/src/textord/bbgrid.h +++ b/src/textord/bbgrid.h @@ -372,10 +372,7 @@ class GridSearch { // Sort function to sort a BBC by bounding_box().left(). 
template -int SortByBoxLeft(const void *void1, const void *void2) { - // The void*s are actually doubly indirected, so get rid of one level. - const BBC *p1 = *static_cast(void1); - const BBC *p2 = *static_cast(void2); +int SortByBoxLeft(const BBC *p1, const BBC *p2) { int result = p1->bounding_box().left() - p2->bounding_box().left(); if (result != 0) { return result; @@ -392,10 +389,7 @@ int SortByBoxLeft(const void *void1, const void *void2) { } template -bool StdSortByBoxLeft(const void *void1, const void *void2) { - // The void*s are actually doubly indirected, so get rid of one level. - const BBC *p1 = *static_cast(void1); - const BBC *p2 = *static_cast(void2); +bool StdSortByBoxLeft(const BBC *p1, const BBC *p2) { int result = p1->bounding_box().left() - p2->bounding_box().left(); if (result != 0) { return result < 0; @@ -413,10 +407,7 @@ bool StdSortByBoxLeft(const void *void1, const void *void2) { // Sort function to sort a BBC by bounding_box().right() in right-to-left order. template -int SortRightToLeft(const void *void1, const void *void2) { - // The void*s are actually doubly indirected, so get rid of one level. - const BBC *p1 = *static_cast(void1); - const BBC *p2 = *static_cast(void2); +int SortRightToLeft(const BBC *p1, const BBC *p2) { int result = p2->bounding_box().right() - p1->bounding_box().right(); if (result != 0) { return result; @@ -433,10 +424,7 @@ int SortRightToLeft(const void *void1, const void *void2) { } template -bool StdSortRightToLeft(const void *void1, const void *void2) { - // The void*s are actually doubly indirected, so get rid of one level. - const BBC *p1 = *static_cast(void1); - const BBC *p2 = *static_cast(void2); +bool StdSortRightToLeft(const BBC *p1, const BBC *p2) { int result = p2->bounding_box().right() - p1->bounding_box().right(); if (result != 0) { return result < 0; @@ -454,10 +442,7 @@ bool StdSortRightToLeft(const void *void1, const void *void2) { // Sort function to sort a BBC by bounding_box().bottom(). 
template -int SortByBoxBottom(const void *void1, const void *void2) { - // The void*s are actually doubly indirected, so get rid of one level. - const BBC *p1 = *static_cast(void1); - const BBC *p2 = *static_cast(void2); +int SortByBoxBottom(const BBC *p1, const BBC *p2) { int result = p1->bounding_box().bottom() - p2->bounding_box().bottom(); if (result != 0) { return result; diff --git a/src/textord/blkocc.cpp b/src/textord/blkocc.cpp index 12482128e2..910fa857c2 100644 --- a/src/textord/blkocc.cpp +++ b/src/textord/blkocc.cpp @@ -126,7 +126,7 @@ static void horizontal_cblob_projection( // project outlines /** * horizontal_coutline_projection * - * Compute the horizontal projection of a outline from its outlines + * Compute the horizontal projection of an outline from its outlines * and add to the given STATS. */ diff --git a/src/textord/blkocc.h b/src/textord/blkocc.h index aedc308e45..0e1a53d02c 100644 --- a/src/textord/blkocc.h +++ b/src/textord/blkocc.h @@ -44,7 +44,7 @@ CLASS REGION_OCC ****************************************************************************/ -class REGION_OCC : public ELIST_LINK { +class REGION_OCC : public ELIST::LINK { public: float min_x; // Lowest x in region float max_x; // Highest x in region diff --git a/src/textord/colpartition.h b/src/textord/colpartition.h index dceaf16454..a07bfe5549 100644 --- a/src/textord/colpartition.h +++ b/src/textord/colpartition.h @@ -70,7 +70,7 @@ CLISTIZEH(ColPartition); * to a given y-coordinate range, eventually, a ColPartitionSet of ColPartitions * emerges, which represents the columns over a wide y-coordinate range. */ -class TESS_API ColPartition : public ELIST2_LINK { +class TESS_API ColPartition : public ELIST2::LINK { public: // This empty constructor is here only so that the class can be ELISTIZED. 
// TODO(rays) change deep_copy in elst.h line 955 to take a callback copier @@ -720,9 +720,7 @@ class TESS_API ColPartition : public ELIST2_LINK { bool IsInSameColumnAs(const ColPartition &part) const; // Sort function to sort by bounding box. - static int SortByBBox(const void *p1, const void *p2) { - const ColPartition *part1 = *static_cast(p1); - const ColPartition *part2 = *static_cast(p2); + static int SortByBBox(const ColPartition *part1, const ColPartition *part2) { int mid_y1 = part1->bounding_box_.y_middle(); int mid_y2 = part2->bounding_box_.y_middle(); if ((part2->bounding_box_.bottom() <= mid_y1 && diff --git a/src/textord/colpartitionset.h b/src/textord/colpartitionset.h index 2d05770411..0e384cbbbf 100644 --- a/src/textord/colpartitionset.h +++ b/src/textord/colpartitionset.h @@ -36,7 +36,7 @@ using PartSetVector = std::vector; // Its main use is in holding a candidate partitioning of the width of the // image into columns, where each member ColPartition is a single column. // ColPartitionSets are used in building the column layout of a page. 
-class ColPartitionSet : public ELIST_LINK { +class ColPartitionSet : public ELIST::LINK { public: ColPartitionSet() = default; explicit ColPartitionSet(ColPartition_LIST *partitions); diff --git a/src/textord/devanagari_processing.cpp b/src/textord/devanagari_processing.cpp index b6384274f1..177205fac5 100644 --- a/src/textord/devanagari_processing.cpp +++ b/src/textord/devanagari_processing.cpp @@ -38,17 +38,18 @@ INT_VAR(devanagari_split_debuglevel, 0, "Debug level for split shiro-rekha proce BOOL_VAR(devanagari_split_debugimage, 0, "Whether to create a debug image for split shiro-rekha process."); -ShiroRekhaSplitter::ShiroRekhaSplitter(Tesseract* tess) - : tesseract_(tess) { +ShiroRekhaSplitter::ShiroRekhaSplitter(Tesseract* tess) : + tesseract_(tess), + orig_pix_(nullptr), + splitted_image_(nullptr), + pageseg_split_strategy_(NO_SPLIT), + ocr_split_strategy_(NO_SPLIT), + debug_image_(nullptr), + segmentation_block_list_(nullptr), + global_xheight_(kUnspecifiedXheight), + perform_close_(false) +{ ASSERT0(tess != nullptr); - orig_pix_ = nullptr; - segmentation_block_list_ = nullptr; - splitted_image_ = nullptr; - global_xheight_ = kUnspecifiedXheight; - perform_close_ = false; - debug_image_ = nullptr; - pageseg_split_strategy_ = NO_SPLIT; - ocr_split_strategy_ = NO_SPLIT; } ShiroRekhaSplitter::~ShiroRekhaSplitter() { diff --git a/src/textord/fpchop.h b/src/textord/fpchop.h index b7e15f29db..8167d09cf4 100644 --- a/src/textord/fpchop.h +++ b/src/textord/fpchop.h @@ -25,7 +25,7 @@ namespace tesseract { -class C_OUTLINE_FRAG : public ELIST_LINK { +class C_OUTLINE_FRAG : public ELIST::LINK { public: C_OUTLINE_FRAG() { // empty constructor steps = nullptr; diff --git a/src/textord/makerow.cpp b/src/textord/makerow.cpp index d9aef8d32a..b45e3ba1e7 100644 --- a/src/textord/makerow.cpp +++ b/src/textord/makerow.cpp @@ -111,13 +111,8 @@ FZ_HEAPDBG_TRACKER_SECTION_END_MARKER(_) * Sort function to sort rows in y from page top. 
*/ static int row_y_order( // sort function - const void *item1, // items to compare - const void *item2) { - // converted ptr - const TO_ROW *row1 = *reinterpret_cast(item1); - // converted ptr - const TO_ROW *row2 = *reinterpret_cast(item2); - + const TO_ROW *row1, // items to compare + const TO_ROW *row2) { if (row1->parallel_c() > row2->parallel_c()) { return -1; } else if (row1->parallel_c() < row2->parallel_c()) { @@ -2532,13 +2527,8 @@ OVERLAP_STATE most_overlapping_row( // find best row * Sort function to sort blobs in x from page left. */ int blob_x_order( // sort function - const void *item1, // items to compare - const void *item2) { - // converted ptr - const BLOBNBOX *blob1 = *reinterpret_cast(item1); - // converted ptr - const BLOBNBOX *blob2 = *reinterpret_cast(item2); - + const BLOBNBOX *blob1, // items to compare + const BLOBNBOX *blob2) { if (blob1->bounding_box().left() < blob2->bounding_box().left()) { return -1; } else if (blob1->bounding_box().left() > blob2->bounding_box().left()) { diff --git a/src/textord/makerow.h b/src/textord/makerow.h index c01086f5c8..386c417119 100644 --- a/src/textord/makerow.h +++ b/src/textord/makerow.h @@ -244,8 +244,8 @@ OVERLAP_STATE most_overlapping_row(TO_ROW_IT *row_it, // iterator bool testing_blob // test stuff ); int blob_x_order( // sort function - const void *item1, // items to compare - const void *item2); + const BLOBNBOX *item1, // items to compare + const BLOBNBOX *item2); void mark_repeated_chars(TO_ROW *row); diff --git a/src/textord/pithsync.cpp b/src/textord/pithsync.cpp index 488977ab73..a4c09eb187 100644 --- a/src/textord/pithsync.cpp +++ b/src/textord/pithsync.cpp @@ -122,6 +122,7 @@ void FPCUTPT::assign( // constructor // half of pitch int16_t half_pitch = pitch / 2 - 1; uint32_t lead_flag; // new flag + float inv_projection_scale = 1.0f / projection_scale; if (half_pitch > 31) { half_pitch = 31; @@ -167,7 +168,7 @@ void FPCUTPT::assign( // constructor (projection->pile_count(x - 
balance_index) <= zero_count); } } - balance_count = static_cast(balance_count * textord_balance_factor / projection_scale); + balance_count = static_cast(balance_count * textord_balance_factor * inv_projection_scale); } r_index = segpt->region_index + 1; total = segpt->mean_sum + dist; @@ -222,6 +223,7 @@ void FPCUTPT::assign_cheap( // constructor // half of pitch int16_t half_pitch = pitch / 2 - 1; uint32_t lead_flag; // new flag + float inv_projection_scale = 1.0f / projection_scale; if (half_pitch > 31) { half_pitch = 31; @@ -260,7 +262,7 @@ void FPCUTPT::assign_cheap( // constructor balance_count++; lead_flag &= lead_flag - 1; } - balance_count = static_cast(balance_count * textord_balance_factor / projection_scale); + balance_count = static_cast(balance_count * textord_balance_factor * inv_projection_scale); } r_index = segpt->region_index + 1; total = segpt->mean_sum + dist; @@ -513,6 +515,7 @@ double check_pitch_sync3( // find segmentation int16_t best_fake; // best fake level int16_t best_count; // no of cuts FPSEGPT_IT seg_it = seg_list; // output iterator + float inv_projection_scale = 1.0f / projection_scale; end = (end - start) % pitch; if (pitch < 3) { @@ -599,7 +602,7 @@ double check_pitch_sync3( // find segmentation offset = projection->pile_count(x); faking = true; } else { - projection_offset = static_cast(projection->pile_count(x) / projection_scale); + projection_offset = static_cast(projection->pile_count(x) * inv_projection_scale); if (projection_offset > offset) { offset = projection_offset; } diff --git a/src/textord/pitsync1.h b/src/textord/pitsync1.h index d9815159e2..6cc2c71c1f 100644 --- a/src/textord/pitsync1.h +++ b/src/textord/pitsync1.h @@ -31,7 +31,7 @@ namespace tesseract { class FPSEGPT_LIST; -class FPSEGPT : public ELIST_LINK { +class FPSEGPT : public ELIST::LINK { public: FPSEGPT() = default; FPSEGPT( // constructor diff --git a/src/textord/sortflts.h b/src/textord/sortflts.h index 873f847403..5b6a9b3d5b 100644 --- 
a/src/textord/sortflts.h +++ b/src/textord/sortflts.h @@ -23,7 +23,7 @@ namespace tesseract { -class SORTED_FLOAT : public ELIST_LINK { +class SORTED_FLOAT : public ELIST::LINK { friend class SORTED_FLOATS; public: diff --git a/src/textord/tabfind.cpp b/src/textord/tabfind.cpp index 71ba25995a..e0a144d429 100644 --- a/src/textord/tabfind.cpp +++ b/src/textord/tabfind.cpp @@ -66,8 +66,8 @@ TabFind::TabFind(Tesseract* tess, int gridsize, const ICOORD &bleft, const ICOOR : AlignedBlob(tess, gridsize, bleft, tright) , resolution_(resolution) , image_origin_(0, tright.y() - 1) - , v_it_(&vectors_) { - width_cb_ = nullptr; + , v_it_(&vectors_) + , width_cb_(nullptr) { v_it_.add_list_after(vlines); SetVerticalSkewAndParallelize(vertical_x, vertical_y); using namespace std::placeholders; // for _1 diff --git a/src/textord/tablefind.cpp b/src/textord/tablefind.cpp index dbce903977..afb927c3c8 100644 --- a/src/textord/tablefind.cpp +++ b/src/textord/tablefind.cpp @@ -2149,7 +2149,7 @@ void TableFinder::MakeTableBlocks(ColPartitionGrid *grid, //////// ColSegment code //////// ColSegment::ColSegment() - : ELIST_LINK(), + : ELIST::LINK(), num_table_cells_(0), num_text_cells_(0), type_(COL_UNKNOWN) {} diff --git a/src/textord/tablefind.h b/src/textord/tablefind.h index 7b8c0a9471..5edfacbe00 100644 --- a/src/textord/tablefind.h +++ b/src/textord/tablefind.h @@ -39,7 +39,7 @@ class ColSegment; ELISTIZEH(ColSegment); CLISTIZEH(ColSegment); -class ColSegment : public ELIST_LINK { +class ColSegment : public ELIST::LINK { public: ColSegment(); ~ColSegment() = default; diff --git a/src/textord/tabvector.h b/src/textord/tabvector.h index 0fab66bbfb..ab6e9987c9 100644 --- a/src/textord/tabvector.h +++ b/src/textord/tabvector.h @@ -67,7 +67,7 @@ ELISTIZEH(TabConstraint); // on a list of constraints. The list itself is cooperatively owned // by the TabVectors of the constraints on the list and managed // by implicit reference counting via the elements of the list. 
-class TabConstraint : public ELIST_LINK { +class TabConstraint : public ELIST::LINK { public: // This empty constructor is here only so that the class can be ELISTIZED. // TODO(rays) change deep_copy in elst.h line 955 to take a callback copier @@ -105,7 +105,7 @@ class TabConstraint : public ELIST_LINK { // Class to hold information about a single vector // that represents a tab stop or a rule line. -class TabVector : public ELIST2_LINK { +class TabVector : public ELIST2::LINK { public: // TODO(rays) fix this in elst.h line 1076, where it should use the // copy constructor instead of operator=. @@ -289,9 +289,7 @@ class TabVector : public ELIST2_LINK { } // Sort function for E2LIST::sort to sort by sort_key_. - static int SortVectorsByKey(const void *v1, const void *v2) { - const TabVector *tv1 = *static_cast(v1); - const TabVector *tv2 = *static_cast(v2); + static int SortVectorsByKey(const TabVector *tv1, const TabVector *tv2) { return tv1->sort_key_ - tv2->sort_key_; } diff --git a/src/textord/wordseg.cpp b/src/textord/wordseg.cpp index ca226a794a..6c096cd780 100644 --- a/src/textord/wordseg.cpp +++ b/src/textord/wordseg.cpp @@ -89,6 +89,7 @@ void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) { word->set_flag(W_EOL, true); word->set_flag(W_DONT_CHOP, one_blob); word_it.add_after_then_move(word); + real_row->recalc_bounding_box(); row_it.add_after_then_move(real_row); } } diff --git a/src/textord/workingpartset.h b/src/textord/workingpartset.h index 129a42eb44..cdd5c1fda6 100644 --- a/src/textord/workingpartset.h +++ b/src/textord/workingpartset.h @@ -30,7 +30,7 @@ namespace tesseract { // WorkingPartSet holds a working set of ColPartitions during transformation // from the grid-based storage to regions in logical reading order, and is // therefore only used during construction of the regions. 
-class WorkingPartSet : public ELIST_LINK { +class WorkingPartSet : public ELIST::LINK { public: explicit WorkingPartSet(ColPartition *column) : column_(column), latest_part_(nullptr), part_it_(&part_set_) {} diff --git a/src/training/CMakeLists.txt b/src/training/CMakeLists.txt index 7fbf02120e..c764442070 100644 --- a/src/training/CMakeLists.txt +++ b/src/training/CMakeLists.txt @@ -126,7 +126,7 @@ install( ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) generate_export_header(common_training EXPORT_MACRO_NAME TESS_COMMON_TRAINING_API) -if (MSVC) +if (MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION bin OPTIONAL) endif() project_group(common_training "Training Tools") @@ -298,7 +298,7 @@ if(ICU_FOUND) RUNTIME DESTINATION bin LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) - if (MSVC) + if (MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION bin OPTIONAL) endif() generate_export_header(unicharset_training EXPORT_MACRO_NAME diff --git a/src/training/classifier_tester.cpp b/src/training/classifier_tester.cpp index 9234487de5..ddde1e5e57 100644 --- a/src/training/classifier_tester.cpp +++ b/src/training/classifier_tester.cpp @@ -47,13 +47,13 @@ static const char *names[] = {"pruner", "full"}; FZ_HEAPDBG_TRACKER_SECTION_END_MARKER(_) -static tesseract::ShapeClassifier *InitializeClassifier(const char *classifer_name, +static tesseract::ShapeClassifier *InitializeClassifier(tesseract::TessBaseAPI &api, const char *classifier_name, const UNICHARSET &unicharset, int argc, - const char **argv, tesseract::TessBaseAPI **api) { + const char **argv) { // Decode the classifier string. 
ClassifierName classifier = CN_COUNT; for (int c = 0; c < CN_COUNT; ++c) { - if (strcmp(classifer_name, names[c]) == 0) { + if (strcmp(classifier_name, names[c]) == 0) { classifier = static_cast(c); break; } @@ -64,17 +64,15 @@ static tesseract::ShapeClassifier *InitializeClassifier(const char *classifer_na } // We need to initialize tesseract to test. - *api = new tesseract::TessBaseAPI; tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY; tesseract::Tesseract *tesseract = nullptr; tesseract::Classify *classify = nullptr; if (classifier == CN_PRUNER || classifier == CN_FULL) { - tesseract::TessBaseAPI *tess = *api; - if (tess->Init(test_tessdata_dir.c_str(), test_lang.c_str(), engine_mode) < 0) { + if (api.InitOem(test_tessdata_dir.c_str(), test_lang.c_str(), engine_mode) < 0) { tprintError("Tesseract initialization failed!\n"); return nullptr; } - tesseract = &tess->tesseract(); + tesseract = &api.tesseract(); classify = static_cast(tesseract); if (classify->shape_table() == nullptr) { tprintError("Tesseract must contain a ShapeTable!\n"); @@ -88,7 +86,7 @@ static tesseract::ShapeClassifier *InitializeClassifier(const char *classifer_na } else if (classifier == CN_FULL) { shape_classifier = new tesseract::TessClassifier(false, classify); } - tprintDebug("Testing classifier {}:\n", classifer_name); + tprintDebug("Testing classifier {}:\n", classifier_name); return shape_classifier; } @@ -120,16 +118,17 @@ extern "C" TESS_API int tesseract_classifier_tester_main(int argc, const char** tesseract::CheckSharedLibraryVersion(); (void)tesseract::SetConsoleModeToUTF8(); - int rv = ParseArguments(&argc, &argv); + tesseract::TessBaseAPI api; + + int rv = ParseArguments(api, &argc, &argv); if (rv >= 0) { return rv; } std::string file_prefix; auto trainer = tesseract::LoadTrainingData(argv + 1, false, nullptr, file_prefix); - tesseract::TessBaseAPI *api; // Decode the classifier string. 
tesseract::ShapeClassifier *shape_classifier = - InitializeClassifier(test_classifier.c_str(), trainer->unicharset(), argc, argv, &api); + InitializeClassifier(api, test_classifier.c_str(), trainer->unicharset(), argc, argv); if (shape_classifier == nullptr) { tprintError("Classifier init failed!:{}\n", test_classifier.c_str()); return EXIT_FAILURE; @@ -144,7 +143,6 @@ extern "C" TESS_API int tesseract_classifier_tester_main(int argc, const char** test_report_level, false, shape_classifier, nullptr); delete shape_classifier; - delete api; return EXIT_SUCCESS; } /* main */ diff --git a/src/training/cntraining.cpp b/src/training/cntraining.cpp index 32145cc6fe..cc90cd3c34 100644 --- a/src/training/cntraining.cpp +++ b/src/training/cntraining.cpp @@ -116,6 +116,8 @@ extern "C" TESS_API int tesseract_cn_training_main(int argc, const char** argv) tesseract::CheckSharedLibraryVersion(); (void)tesseract::SetConsoleModeToUTF8(); + tesseract::TessBaseAPI api; + // Set the global Config parameters before parsing the command line. 
Config = CNConfig; @@ -129,7 +131,7 @@ extern "C" TESS_API int tesseract_cn_training_main(int argc, const char** argv) FEATURE_DEFS_STRUCT FeatureDefs; InitFeatureDefs(&FeatureDefs); - rv = ParseArguments(&argc, &argv); + rv = ParseArguments(api, &argc, &argv); if (rv >= 0) { return rv; } diff --git a/src/training/combine_lang_model.cpp b/src/training/combine_lang_model.cpp index ad68d59809..83392871ed 100644 --- a/src/training/combine_lang_model.cpp +++ b/src/training/combine_lang_model.cpp @@ -21,6 +21,7 @@ #include "common/commandlineflags.h" #include "common/commontraining.h" // CheckSharedLibraryVersion #include "unicharset/lang_model_helpers.h" +#include "tesserrstream.h" // for tesserr #include #include "unicharset/unicharset_training_utils.h" diff --git a/src/training/combine_tessdata.cpp b/src/training/combine_tessdata.cpp index 7db9711324..c6cc30029a 100644 --- a/src/training/combine_tessdata.cpp +++ b/src/training/combine_tessdata.cpp @@ -39,14 +39,14 @@ static int list_components(TessdataManager &tm, const char *filename) { return EXIT_SUCCESS; } -static int list_network(TessdataManager &tm, const char *filename, int tess_debug_lstm) { +static int list_network(TessBaseAPI &api, TessdataManager &tm, const char *filename, int tess_debug_lstm) { if (filename != nullptr && !tm.Init(filename)) { tprintError("Failed to read {}\n", filename); return EXIT_FAILURE; } tesseract::TFile fp; if (tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) { - tesseract::LSTMRecognizer recognizer(nullptr); + tesseract::LSTMRecognizer recognizer(api.tesseract()); recognizer.SetDebug(tess_debug_lstm); if (!recognizer.DeSerialize(&tm, &fp)) { tprintError("Failed to deserialize LSTM in {}!\n", filename); @@ -191,153 +191,171 @@ extern "C" int tesseract_combine_tessdata_main(int argc, const char** argv) } tesseract::TessdataManager tm; - - if (argc == 2) { - tprintDebug("Combining tessdata files\n"); - std::string lang = argv[1]; - const char* last = &argv[1][strlen(argv[1]) - 1]; - 
if (*last != '.') { - lang += '.'; - } - std::string output_file = lang; - output_file += kTrainedDataSuffix; - if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) { - tprintError("Error combining tessdata files into {}\n", output_file); - } - else { - tprintDebug("Output {} created successfully.\n", output_file); - } + tesseract::TessBaseAPI api; + if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) { + tprintInfo("{}\n", tesseract::TessBaseAPI::Version()); + return EXIT_SUCCESS; + } else if (argc == 2) { + tprintDebug("Combining tessdata files\n"); + std::string lang = argv[1]; + const char *last = &argv[1][strlen(argv[1])-1]; + if (*last != '.') { + lang += '.'; } - else if (argc >= 4 && - (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) { - // Initialize TessdataManager with the data in the given traineddata file. - if (!tm.Init(argv[2])) { - tprintError("Failed to read {}\n", argv[2]); - return EXIT_FAILURE; - } - tprintDebug("Extracting tessdata components from {}\n", argv[2]); - if (strcmp(argv[1], "-e") == 0) { - for (int i = 3; i < argc; ++i) { - errno = 0; - if (tm.ExtractToFile(argv[i])) { - tprintDebug("Wrote {}\n", argv[i]); - } - else if (errno == 0) { - tprintError("Not extracting {}, since this component" - " is not present\n", - argv[i]); - return EXIT_FAILURE; - } - else { - tprintError("Could not extract {}: {}\n", argv[i], strerror(errno)); - return EXIT_FAILURE; - } + std::string output_file = lang; + output_file += "traineddata"; + if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) { + tprintError("Error combining tessdata files into {}\n", output_file); + } else { + tprintDebug("Output {} created successfully.\n", output_file); + } + } else if (argc >= 4 && + (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) { + // Initialize TessdataManager with the data in the given traineddata file. 
+ if (!tm.Init(argv[2])) { + tprintError("Failed to read {}\n", argv[2]); + return EXIT_FAILURE; + } + tprintDebug("Extracting tessdata components from {}\n", argv[2]); + if (strcmp(argv[1], "-e") == 0) { + for (i = 3; i < argc; ++i) { + errno = 0; + if (tm.ExtractToFile(argv[i])) { + tprintDebug("Wrote {}\n", argv[i]); + } else if (errno == 0) { + tprintError("Not extracting {}, since this component" + " is not present\n", + argv[i]); + return EXIT_FAILURE; + } else { + tprintError("Could not extract {}: {}\n", argv[i], strerror(errno)); + return EXIT_FAILURE; } } - else { // extract all the components - for (int i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) { - std::string filename = argv[3]; - const char* last = &argv[3][strlen(argv[3]) - 1]; - if (*last != '.') { - filename += '.'; - } - filename += tesseract::kTessdataFileSuffixes[i]; - errno = 0; - if (tm.ExtractToFile(filename.c_str())) { - tprintDebug("Wrote {}\n", filename); - } - else if (errno != 0) { - tprintError("Could not extract {}: {}\n", filename, - strerror(errno)); - return EXIT_FAILURE; - } + } else { // extract all the components + for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) { + std::string filename = argv[3]; + const char *last = &argv[3][strlen(argv[3])-1]; + if (*last != '.') { + filename += '.'; + } + filename += tesseract::kTessdataFileSuffixes[i]; + errno = 0; + if (tm.ExtractToFile(filename.c_str())) { + tprintDebug("Wrote {}\n", filename); + } else if (errno != 0) { + tprintError("Could not extract {}: {}\n", filename, + strerror(errno)); + return EXIT_FAILURE; } } } - else if (argc >= 4 && strcmp(argv[1], "-o") == 0) { - // Rename the current traineddata file to a temporary name. 
- const char* new_traineddata_filename = argv[2]; - std::string traineddata_filename = new_traineddata_filename; - traineddata_filename += ".__tmp__"; - if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) { - tprintError("Failed to create a temporary file {}\n", - traineddata_filename); - return EXIT_FAILURE; - } + } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) { + // Rename the current traineddata file to a temporary name. + const char *new_traineddata_filename = argv[2]; + std::string traineddata_filename = new_traineddata_filename; + traineddata_filename += ".__tmp__"; + if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) { + tprintError("Failed to create a temporary file {}\n", + traineddata_filename); + return EXIT_FAILURE; + } - // Initialize TessdataManager with the data in the given traineddata file. - tm.Init(traineddata_filename.c_str()); + // Initialize TessdataManager with the data in the given traineddata file. + tm.Init(traineddata_filename.c_str()); - // Write the updated traineddata file. 
- tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3); - } - else if (argc == 3 && strcmp(argv[1], "-c") == 0) { - if (!tm.Init(argv[2])) { - tprintError("Failed to read {}\n", argv[2]); - return EXIT_FAILURE; - } - tesseract::TFile fp; - if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) { - tprintError("No LSTM Component found in {}!\n", argv[2]); - return EXIT_FAILURE; - } - tesseract::LSTMRecognizer recognizer(nullptr); - recognizer.SetDebug(tess_debug_lstm); - if (!recognizer.DeSerialize(&tm, &fp)) { - tprintError("Failed to deserialize LSTM in {}!\n", argv[2]); - return EXIT_FAILURE; - } - recognizer.ConvertToInt(); - std::vector lstm_data; - fp.OpenWrite(&lstm_data); - ASSERT_HOST(recognizer.Serialize(&tm, &fp)); - tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0], - lstm_data.size()); - if (!tm.SaveFile(argv[2], nullptr)) { - tprintError("Failed to write modified traineddata:{}!\n", argv[2]); - return EXIT_FAILURE; - } - } - else if (argc == 3 && strcmp(argv[1], "-t") == 0) { -#if defined(HAVE_LIBARCHIVE) - if (!tm.Init(argv[2])) { - tprintError("Failed to read %s\n", argv[2]); - return EXIT_FAILURE; - } - if (!tm.SaveFile(argv[2], nullptr)) { - tprintError("Failed to tranform traineddata:%s!\n", argv[2]); - return EXIT_FAILURE; - } -#else - tprintError("Failed to load libarchive. Is tesseract compiled with libarchive support?\n"); -#endif + // Write the updated traineddata file. 
+ tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3); + } else if (argc == 3 && strcmp(argv[1], "-c") == 0) { + if (!tm.Init(argv[2])) { + tprintError("Failed to read {}\n", argv[2]); + return EXIT_FAILURE; } - else if (argc == 3 && strcmp(argv[1], "-d") == 0) { - return list_components(tm, argv[2]); + tesseract::TFile fp; + if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) { + tprintError("No LSTM Component found in {}!\n", argv[2]); + return EXIT_FAILURE; } - else if (argc == 3 && strcmp(argv[1], "-l") == 0) { - return list_network(tm, argv[2], tess_debug_lstm); + tesseract::LSTMRecognizer recognizer(api.tesseract()); + recognizer.SetDebug(tess_debug_lstm); + if (!recognizer.DeSerialize(&tm, &fp)) { + tprintError("Failed to deserialize LSTM in {}!\n", argv[2]); + return EXIT_FAILURE; } - else if (argc == 3 && strcmp(argv[1], "-dl") == 0) { - int result = list_components(tm, argv[2]); - if (result == EXIT_SUCCESS) { - result = list_network(tm, nullptr, tess_debug_lstm); - } - return result; + recognizer.ConvertToInt(); + std::vector lstm_data; + fp.OpenWrite(&lstm_data); + ASSERT_HOST(recognizer.Serialize(&tm, &fp)); + tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0], + lstm_data.size()); + if (!tm.SaveFile(argv[2], nullptr)) { + tprintError("Failed to write modified traineddata:{}!\n", argv[2]); + return EXIT_FAILURE; } - else if (argc == 3 && strcmp(argv[1], "-ld") == 0) { - int result = list_network(tm, argv[2], tess_debug_lstm); - if (result == EXIT_SUCCESS) { - result = list_components(tm, nullptr); - } - return result; + } else if (argc == 3 && strcmp(argv[1], "-d") == 0) { + return list_components(tm, argv[2]); + } else if (argc == 3 && strcmp(argv[1], "-l") == 0) { + return list_network(api, tm, argv[2], tess_debug_lstm); + } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) { + int result = list_components(tm, argv[2]); + if (result == EXIT_SUCCESS) { + result = list_network(api, tm, nullptr, tess_debug_lstm); } - else { - 
tprintError("Unsupported command '{}' or bad number of arguments ({}).\n", argv[1], argc - 1); - argc = 1; - continue; + return result; + } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) { + int result = list_network(api, tm, argv[2], tess_debug_lstm); + if (result == EXIT_SUCCESS) { + result = list_components(tm, nullptr); } - tm.Directory(); - return EXIT_SUCCESS; + return result; + } else { + const char* exename = fz_basename(argv[0]); + tprintInfo( + "Usage for combining tessdata components:\n" + " {} language_data_path_prefix\n" + " (e.g. {} tessdata/eng.)\n\n", + exename, exename); + tprintInfo( + "Usage for extracting tessdata components:\n" + " {} -e traineddata_file [output_component_file...]\n" + " (e.g. {} -e eng.traineddata eng.unicharset)\n\n", + exename, exename); + tprintInfo( + "Usage for overwriting tessdata components:\n" + " {} -o traineddata_file [input_component_file...]\n" + " (e.g. {} -o eng.traineddata eng.unicharset)\n\n", + exename, exename); + tprintInfo( + "Usage for unpacking all tessdata components:\n" + " {} -u traineddata_file output_path_prefix\n" + " (e.g. {} -u eng.traineddata tmp/eng.)\n\n", + exename, exename); + tprintInfo( + "Usage for listing the network information\n" + " {} -l traineddata_file\n" + " (e.g. 
{} -l eng.traineddata)\n\n", + exename, exename); + tprintInfo( + "Usage for listing directory of components:\n" + " {} -d traineddata_file\n\n", + exename); + tprintInfo( + "NOTE: Above two flags may combined as -dl or -ld to get both outputs.\n\n" + ); + tprintInfo( + "Usage for compacting LSTM component to int:\n" + " {} -c traineddata_file\n", + exename); + + + + + + + + + return EXIT_FAILURE; } + tm.Directory(); + return EXIT_SUCCESS; } diff --git a/src/training/common/commandlineflags.cpp b/src/training/common/commandlineflags.cpp index 65b3ed5bcb..e8a1a2656d 100644 --- a/src/training/common/commandlineflags.cpp +++ b/src/training/common/commandlineflags.cpp @@ -17,14 +17,56 @@ #include // for std::locale::classic #include // for std::stringstream #include // for std::vector +#include +#include #include "errcode.h" #include "helpers.h" #include +namespace fs = std::filesystem; + using namespace ::parameters; namespace tesseract { +static bool AnyFlagExists(const char *flag_name) { + std::string full_flag_name("FLAGS_"); + full_flag_name += flag_name; + { + std::vector empty; + auto *p = + ParamUtils::FindParam(full_flag_name.c_str(), GlobalParams()->int_params_c(), empty); + if (p) { + return true; + } + } + { + std::vector empty; + auto *p = + ParamUtils::FindParam(full_flag_name.c_str(), GlobalParams()->bool_params_c(), empty); + if (p) { + return true; + } + } + { + std::vector empty; + auto *p = + ParamUtils::FindParam(full_flag_name.c_str(), GlobalParams()->double_params_c(), empty); + if (p) { + return true; + } + } + { + std::vector empty; + auto *p = + ParamUtils::FindParam(full_flag_name.c_str(), GlobalParams()->string_params_c(), empty); + if (p) { + return true; + } + } + return false; +} + static void PrintCommandLineFlags() { const char *kFlagNamePrefix = "FLAGS_"; const int kFlagNamePrefixLen = strlen(kFlagNamePrefix); @@ -44,7 +86,16 @@ int ParseCommandLineFlags(const char *extra_usage, std::function 0 && argv[0]) ? 
fz_basename(argv[0]) : "???"); + + std::string appname; + if (argc > 0 && argv[0]) { + fs::path exename = argv[0]; + appname = exename.stem().string(); + extra_usage = boost::replace_all_copy(appname, "tesseract-", ""); + } + else { + appname = "???"; + } if (argc == 1) { tprintInfo("USAGE:\n {} -v | --version | {}\n", appname, extra_usage); @@ -107,49 +158,51 @@ int ParseCommandLineFlags(const char *extra_usage, std::function(full_flag_name.c_str(), GlobalParams()); - if (p == nullptr) { - // Flag was not found. Exit with an error message? - - // When the commandline option is a single character, it's probably - // an application specific command. Keep it. - if (lhs.length() == 1) { - break; - } - - tprintError("Non-existent flag '{}'\n", lhs); - return 1; - } + if (AnyFlagExists(lhs.c_str())) { + std::string full_flag_name("FLAGS_"); + full_flag_name += lhs; + auto *p = ParamUtils::FindParam(full_flag_name.c_str(), GlobalParams()); + if (p == nullptr) { + // Flag was not found. Exit with an error message? - // do not require rhs when parameter is the boolean type: - if (rhs == nullptr) { - // Pick the next argument - if (i + 1 >= argc) { - if (p->type() != BOOL_PARAM) { - tprintError("Could not find value for flag {}\n", lhs); - return 1; - } - else { - // --flag form - rhs = "true"; - } - } - else { - rhs = argv[++i]; - } - } - if (p->type() == BOOL_PARAM && strlen(rhs) == 0) { - // Bad input of the format --bool_flag= - tprintError("Bad boolean flag '{}' argument: '{}'\n", lhs, rhs); - return 1; - } + // When the commandline option is a single character, it's probably + // an application specific command. Keep it. 
+ if (lhs.length() == 1) { + break; + } - p->set_value(rhs); - if (p->has_faulted()) { - tprintError("Could not parse value '{}' for flag '{}'\n", rhs, lhs); - return 1; + tprintError("Non-existent flag '{}'\n", lhs); + return 1; + } + + // do not require rhs when parameter is the boolean type: + if (rhs == nullptr) { + // Pick the next argument + if (i + 1 >= argc) { + if (p->type() != BOOL_PARAM) { + tprintError("Could not find value for flag {}\n", lhs); + return 1; + } + else { + // --flag form + rhs = "true"; + } + } + else { + rhs = argv[++i]; + } + } + if (p->type() == BOOL_PARAM && strlen(rhs) == 0) { + // Bad input of the format --bool_flag= + tprintError("Bad boolean flag '{}' argument: '{}'\n", lhs, rhs); + return 1; + } + + p->set_value(rhs); + if (p->has_faulted()) { + tprintError("Could not parse value '{}' for flag '{}'\n", rhs, lhs); + return 1; + } } } // for each argv if (remove_flags && i > 1) { @@ -157,6 +210,9 @@ int ParseCommandLineFlags(const char *extra_usage, std::function +namespace fs = std::filesystem; + #if DISABLED_LEGACY_ENGINE -# include -# include +#include +#include namespace tesseract { @@ -76,6 +79,7 @@ int ParseArguments(int* argc, const char ***argv) { # include "tessdatamanager.h" # include # include "unicity_table.h" +# include "tesseractclass.h" namespace tesseract { @@ -88,8 +92,6 @@ FZ_HEAPDBG_TRACKER_SECTION_START_MARKER(_) CLUSTERCONFIG Config = {elliptical, 0.625, 0.05, 1.0, 1e-6, 0}; FEATURE_DEFS_STRUCT feature_defs; -static CCUtil *ccutil = nullptr; - INT_VAR(trainer_debug_level, 0, "Level of Trainer debugging"); INT_VAR(trainer_load_images, 0, "Load images with tr files"); STRING_VAR(trainer_configfile, "", "File to load more configs from"); @@ -124,10 +126,7 @@ FZ_HEAPDBG_TRACKER_SECTION_END_MARKER(_) * @param argc number of command line arguments to parse * @param argv command line arguments */ -int ParseArguments(int *argc, const char ***argv) { - if (!ccutil) - ccutil = new CCUtil(); - +int 
ParseArguments(TessBaseAPI &api, int *argc, const char ***argv) { int rv = tesseract::ParseCommandLineFlags("[.tr files ...]", argc, argv); if (rv >= 0) return rv; @@ -139,8 +138,7 @@ int ParseArguments(int *argc, const char ***argv) { Config.Confidence = std::max(0.0, std::min(1.0, double(clusterconfig_confidence))); // Set additional parameters from config file if specified. if (!trainer_configfile.empty()) { - ASSERT0(ccutil != nullptr); - ParamUtils::ReadParamsFile(trainer_configfile, ccutil->params_collective(), nullptr, PARAM_VALUE_IS_SET_BY_CONFIGFILE); + tesseract::ParamUtils::ReadParamsFile(trainer_configfile.c_str(), tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY, api.tesseract().params()); } return rv; } diff --git a/src/training/common/commontraining.h b/src/training/common/commontraining.h index 5f675b6c74..600fe17242 100644 --- a/src/training/common/commontraining.h +++ b/src/training/common/commontraining.h @@ -28,7 +28,7 @@ namespace tesseract { TESS_COMMON_TRAINING_API -int ParseArguments(int* argc, const char ***argv); +int ParseArguments(TessBaseAPI &api, int *argc, const char ***argv); // Check whether the shared tesseract library is the right one. 
// This function must be inline because otherwise it would be part of diff --git a/src/training/common/errorcounter.cpp b/src/training/common/errorcounter.cpp index e44872c19b..9ea501bc45 100644 --- a/src/training/common/errorcounter.cpp +++ b/src/training/common/errorcounter.cpp @@ -21,6 +21,7 @@ #include "sampleiterator.h" #include "shapeclassifier.h" #include "shapetable.h" +#include "tesserrstream.h" #include "trainingsample.h" #include "trainingsampleset.h" #include "unicity_table.h" @@ -51,7 +52,9 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_le std::vector results; plf::nanotimer clock; - clock.start(); + if (report_level > 1) { + clock.start(); + } unsigned total_samples = 0; double unscaled_error = 0.0; // Set a number of samples on which to run the classify debug mode. @@ -59,7 +62,7 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_le // Iterate over all the samples, accumulating errors. for (it->Begin(); !it->AtEnd(); it->Next()) { TrainingSample *mutable_sample = it->MutableSample(); - int page_index = mutable_sample->page_num(); + size_t page_index = mutable_sample->page_num(); Image page_pix = 0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr; // No debug, no keep this. @@ -86,7 +89,6 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_le } ++total_samples; } - const double total_time = clock.get_elapsed_sec(); // Create the appropriate error report. unscaled_error = counter.ReportErrors(report_level, boosting_mode, fontinfo_table, *it, unichar_error, fonts_report); @@ -95,8 +97,9 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_le } if (report_level > 1 && total_samples > 0) { // It is useful to know the time in microseconds/char. 
- tprintDebug("Errors computed in {} sec at {} μs/char\n", total_time, - 1000000.0 * total_time / total_samples); + auto total_time = clock.get_elapsed_ms(); + tprintDebug("Errors computed in {} ms at {} μs/char\n", total_time, + 1000.0 * total_time / total_samples); } return unscaled_error; } @@ -120,7 +123,7 @@ void ErrorCounter::DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifi // Iterate over all the samples, accumulating errors. for (it->Begin(); !it->AtEnd(); it->Next()) { TrainingSample *mutable_sample = it->MutableSample(); - int page_index = mutable_sample->page_num(); + size_t page_index = mutable_sample->page_num(); Image page_pix = 0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr; new_classifier->SetPageImageForDebugReport(page_pix); @@ -409,7 +412,7 @@ double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, } } tprintDebug("Multi-unichar shape use:\n"); - for (int u = 0; u < multi_unichar_counts_.size(); ++u) { + for (size_t u = 0; u < multi_unichar_counts_.size(); ++u) { if (multi_unichar_counts_[u] > 0) { tprintDebug("{} multiple answers for unichar: {}\n", multi_unichar_counts_[u], unicharset_.id_to_unichar(u)); diff --git a/src/training/common/trainingsampleset.cpp b/src/training/common/trainingsampleset.cpp index f396d6c3e7..5a6927fa50 100644 --- a/src/training/common/trainingsampleset.cpp +++ b/src/training/common/trainingsampleset.cpp @@ -26,6 +26,7 @@ #include "intfeaturemap.h" #include "intfeaturespace.h" #include "shapetable.h" +#include "tesserrstream.h" // for tesserr #include "trainingsample.h" #include "trainingsampleset.h" #include "unicity_table.h" @@ -589,7 +590,7 @@ void TrainingSampleSet::SetupFontIdMap() { // Number of samples for each font_id. 
std::vector font_counts; for (auto &sample : samples_) { - const int font_id = sample->font_id(); + const size_t font_id = sample->font_id(); while (font_id >= font_counts.size()) { font_counts.push_back(0); } diff --git a/src/training/lstmeval.cpp b/src/training/lstmeval.cpp index 201accfc0f..37fcad199f 100644 --- a/src/training/lstmeval.cpp +++ b/src/training/lstmeval.cpp @@ -32,7 +32,7 @@ STRING_VAR(lstmeval_traineddata, "", "be the traineddata file that was given to the trainer"); STRING_VAR(lstmeval_eval_listfile, "", "File listing sample files in lstmf training format."); INT_VAR(lstmeval_max_image_MB, 2000, "Max memory to use for images."); -INT_VAR(lstmeval_verbosity, 1, "Amount of diagnosting information to output (0-2)."); +INT_VAR(lstmeval_verbosity, 1, "Amount of diagnostics information to output (0-2)."); FZ_HEAPDBG_TRACKER_SECTION_END_MARKER(_) @@ -45,7 +45,9 @@ extern "C" int tesseract_lstm_eval_main(int argc, const char** argv) tesseract::CheckSharedLibraryVersion(); (void)tesseract::SetConsoleModeToUTF8(); - int rv = ParseArguments(&argc, &argv); + tesseract::TessBaseAPI api; + + int rv = ParseArguments(api, &argc, &argv); if (rv >= 0) { return rv; } diff --git a/src/training/lstmtraining.cpp b/src/training/lstmtraining.cpp index 378cdb9f7d..9e2c0c8de7 100644 --- a/src/training/lstmtraining.cpp +++ b/src/training/lstmtraining.cpp @@ -95,7 +95,9 @@ extern "C" int tesseract_lstm_training_main(int argc, const char** argv) tesseract::CheckSharedLibraryVersion(); (void)tesseract::SetConsoleModeToUTF8(); - int rv = ParseArguments(&argc, &argv); + tesseract::TessBaseAPI api; + + int rv = ParseArguments(api, &argc, &argv); if (rv >= 0) { return rv; } @@ -147,7 +149,7 @@ extern "C" int tesseract_lstm_training_main(int argc, const char** argv) std::string checkpoint_file = training_model_output; checkpoint_file += "_checkpoint"; std::string checkpoint_bak = checkpoint_file + ".bak"; - tesseract::LSTMTrainer trainer(training_model_output, checkpoint_file, + 
tesseract::LSTMTrainer trainer(api.tesseract(), training_model_output, checkpoint_file, training_debug_interval, static_cast(training_max_image_MB) * 1048576); #if !defined(NDEEBUG) diff --git a/src/training/mftraining.cpp b/src/training/mftraining.cpp index 12fdadbcf2..ec6cc4dd22 100644 --- a/src/training/mftraining.cpp +++ b/src/training/mftraining.cpp @@ -202,7 +202,9 @@ extern "C" TESS_API int tesseract_mf_training_main(int argc, const char** argv) tesseract::CheckSharedLibraryVersion(); (void)tesseract::SetConsoleModeToUTF8(); - int rv = ParseArguments(&argc, &argv); + tesseract::TessBaseAPI api; + + int rv = ParseArguments(api, &argc, &argv); if (rv >= 0) { return rv; } diff --git a/src/training/pango/boxchar.cpp b/src/training/pango/boxchar.cpp index d53017c555..71310580fc 100644 --- a/src/training/pango/boxchar.cpp +++ b/src/training/pango/boxchar.cpp @@ -25,6 +25,7 @@ #include "../unicharset/fileio.h" #include "../unicharset/normstrngs.h" +#include "tesserrstream.h" // for tesserr #include #include "unicharset.h" diff --git a/src/training/pango/pango_font_info.cpp b/src/training/pango/pango_font_info.cpp index aef7adce68..203ad45bf0 100644 --- a/src/training/pango/pango_font_info.cpp +++ b/src/training/pango/pango_font_info.cpp @@ -234,7 +234,7 @@ bool PangoFontInfo::CoversUTF8Text(const char *utf8_text, int byte_length) const int len = it.get_utf8(tmp); tmp[len] = '\0'; tprintInfo("'{}' (U+{}) not covered by font.\n", tmp, *it); -#if PANGO_VERSION_CHECK(1, 52, 0) +#if PANGO_VERSION_CHECK(1, 50, 4) g_object_unref(coverage); #else pango_coverage_unref(coverage); @@ -243,7 +243,7 @@ bool PangoFontInfo::CoversUTF8Text(const char *utf8_text, int byte_length) const return false; } } -#if PANGO_VERSION_CHECK(1, 52, 0) +#if PANGO_VERSION_CHECK(1, 50, 4) g_object_unref(coverage); #else pango_coverage_unref(coverage); @@ -315,7 +315,7 @@ int PangoFontInfo::DropUncoveredChars(std::string *utf8_text) const { my_strnmove(out, utf8_char, utf8_len); out += utf8_len; } 
-#if PANGO_VERSION_CHECK(1, 52, 0) +#if PANGO_VERSION_CHECK(1, 50, 4) g_object_unref(coverage); #else pango_coverage_unref(coverage); @@ -526,9 +526,9 @@ bool FontUtils::IsAvailableFont(const char *input_query_desc, std::string *best_ *best_match = selected_desc_str; // Clip the ending ' 0' if there is one. It seems that, if there is no // point size on the end of the fontname, then Pango always appends ' 0'. - int len = best_match->size(); + auto len = best_match->size(); if (len > 2 && best_match->at(len - 1) == '0' && best_match->at(len - 2) == ' ') { - *best_match = best_match->substr(0, len - 2); + best_match->resize(len - 2); } } g_free((void *)selected_desc_str); @@ -620,7 +620,7 @@ int FontUtils::FontScore(const std::unordered_map &ch_map, ch_flags->push_back(covered); } } -#if PANGO_VERSION_CHECK(1, 52, 0) +#if PANGO_VERSION_CHECK(1, 50, 4) g_object_unref(coverage); #else pango_coverage_unref(coverage); diff --git a/src/training/shapeclustering.cpp b/src/training/shapeclustering.cpp index 77d0b886e2..83cc6e0efa 100644 --- a/src/training/shapeclustering.cpp +++ b/src/training/shapeclustering.cpp @@ -56,7 +56,9 @@ extern "C" TESS_API int tesseract_shape_clustering_main(int argc, const char** a tesseract::CheckSharedLibraryVersion(); (void)tesseract::SetConsoleModeToUTF8(); - int rv = ParseArguments(&argc, &argv); + tesseract::TessBaseAPI api; + + int rv = ParseArguments(api, &argc, &argv); if (rv >= 0) { return EXIT_FAILURE; } diff --git a/src/training/unicharset/lstmtester.cpp b/src/training/unicharset/lstmtester.cpp index c2fa3900a4..a16c1cd280 100644 --- a/src/training/unicharset/lstmtester.cpp +++ b/src/training/unicharset/lstmtester.cpp @@ -82,7 +82,8 @@ std::string LSTMTester::RunEvalAsync(int iteration, const double *training_error std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors, const TessdataManager &model_mgr, int training_stage, int verbosity) { - LSTMTrainer trainer; + TessBaseAPI api; + LSTMTrainer 
trainer(api.tesseract()); trainer.SetDebug(HasDebug()); trainer.InitCharSet(model_mgr); TFile fp; diff --git a/src/training/unicharset/lstmtrainer.cpp b/src/training/unicharset/lstmtrainer.cpp index 4aa8180104..0e5008d606 100644 --- a/src/training/unicharset/lstmtrainer.cpp +++ b/src/training/unicharset/lstmtrainer.cpp @@ -32,9 +32,6 @@ #include "../common/networkbuilder.h" #include "ratngs.h" #include "recodebeam.h" -#ifdef INCLUDE_TENSORFLOW -# include "tfnetwork.h" -#endif #include namespace tesseract { @@ -70,8 +67,8 @@ const int kTargetXScale = 5; const int kTargetYScale = 100; #endif // !GRAPHICS_DISABLED -LSTMTrainer::LSTMTrainer() - : LSTMRecognizer(nullptr) +LSTMTrainer::LSTMTrainer(Tesseract &tess) + : LSTMRecognizer(tess) , randomly_rotate_(false) , training_data_(0) , sub_trainer_(nullptr) @@ -80,9 +77,9 @@ LSTMTrainer::LSTMTrainer() debug_interval_ = 0; } -LSTMTrainer::LSTMTrainer(const std::string &model_base, const std::string &checkpoint_name, +LSTMTrainer::LSTMTrainer(Tesseract &tess, const std::string &model_base, const std::string &checkpoint_name, int debug_interval, int64_t max_memory) - : LSTMRecognizer(nullptr) + : LSTMRecognizer(tess) , randomly_rotate_(false) , training_data_(max_memory) , sub_trainer_(nullptr) { @@ -181,23 +178,6 @@ bool LSTMTrainer::InitNetwork(const char *network_spec, int append_index, return true; } -// Initializes a trainer from a serialized TFNetworkModel proto. -// Returns the global step of TensorFlow graph or 0 if failed. 
-#ifdef INCLUDE_TENSORFLOW -int LSTMTrainer::InitTensorFlowNetwork(const std::string &tf_proto) { - delete network_; - TFNetwork *tf_net = new TFNetwork("TensorFlow"); - training_iteration_ = tf_net->InitFromProtoStr(tf_proto); - if (training_iteration_ == 0) { - tprintError("InitFromProtoStr failed!!\n"); - return 0; - } - network_ = tf_net; - ASSERT_HOST(recoder_.code_range() == tf_net->num_classes()); - return training_iteration_; -} -#endif - // Resets all the iteration counters for fine tuning or traininng a head, // where we want the error reporting to reset. void LSTMTrainer::InitIterations() { @@ -581,7 +561,7 @@ bool LSTMTrainer::DeSerialize(const TessdataManager *mgr, TFile *fp) { if (sub_data.empty()) { sub_trainer_ = nullptr; } else { - sub_trainer_ = std::make_unique(); + sub_trainer_ = std::make_unique(tesseract_); if (!ReadTrainingDump(sub_data, *sub_trainer_)) { return false; } @@ -599,7 +579,7 @@ bool LSTMTrainer::DeSerialize(const TessdataManager *mgr, TFile *fp) { // learning rates (by scaling reduction, or layer specific, according to // NF_LAYER_SPECIFIC_LR). void LSTMTrainer::StartSubtrainer(std::stringstream &log_msg) { - sub_trainer_ = std::make_unique(); + sub_trainer_ = std::make_unique(tesseract_); if (!ReadTrainingDump(best_trainer_, *sub_trainer_)) { log_msg << " Failed to revert to previous best for trial!"; sub_trainer_.reset(); @@ -720,7 +700,7 @@ int LSTMTrainer::ReduceLayerLearningRates(TFloat factor, int num_samples, ww_factor *= factor; } // Make a copy of *this, so we can mess about without damaging anything. - LSTMTrainer copy_trainer; + LSTMTrainer copy_trainer(tesseract_); copy_trainer.SetDebug(samples_trainer->HasDebug()); samples_trainer->ReadTrainingDump(orig_trainer, copy_trainer); // Clear the updates, doing nothing else. 
@@ -747,7 +727,7 @@ int LSTMTrainer::ReduceLayerLearningRates(TFloat factor, int num_samples, if (num_weights[i] == 0) { continue; } - LSTMTrainer layer_trainer; + LSTMTrainer layer_trainer(tesseract_); layer_trainer.SetDebug(samples_trainer->HasDebug()); samples_trainer->ReadTrainingDump(updated_trainer, layer_trainer); Network *layer = layer_trainer.GetLayer(layers[i]); @@ -926,6 +906,8 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData *trainingdata, // Apart from space and null, increment the label. This changes the // script-id to the same script-id but upside-down. // The labels need to be reversed in order, as the first is now the last. + // + // TODO: possibly wrong code, check. for (auto truth_label : truth_labels) { if (truth_label != UNICHAR_SPACE && truth_label != null_char_) { ++truth_label; diff --git a/src/training/unicharset/lstmtrainer.h b/src/training/unicharset/lstmtrainer.h index 5d5dab42f4..47555c0452 100644 --- a/src/training/unicharset/lstmtrainer.h +++ b/src/training/unicharset/lstmtrainer.h @@ -86,8 +86,9 @@ using TestCallback = std::function(ch)); const icu::Transliterator *fulltohalf = - icu::Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, error_code); + icu::Transliterator::createInstance(icu::UnicodeString("Fullwidth-Halfwidth"), UTRANS_FORWARD, error_code); error_code.assertSuccess(); error_code.reset(); diff --git a/src/training/unicharset/unicharset_training_utils.cpp b/src/training/unicharset/unicharset_training_utils.cpp index c765d22630..9973ad3ab6 100644 --- a/src/training/unicharset/unicharset_training_utils.cpp +++ b/src/training/unicharset/unicharset_training_utils.cpp @@ -2,7 +2,6 @@ // File: unicharset_training_utils.cpp // Description: Training utilities for UNICHARSET. // Author: Ray Smith -// Created: Fri Oct 17 17:09:01 PDT 2014 // // (C) Copyright 2014, Google Inc. 
// Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,6 +30,7 @@ #include "icuerrorcode.h" #include "normstrngs.h" #include "statistc.h" +#include "tesserrstream.h" // for tesserr #include "unicharset.h" #if defined(HAS_LIBICU) @@ -159,7 +159,7 @@ void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset) tprintError("Failed to load script unicharset from:{}\n", filename); } } - for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) { + for (size_t c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) { if (unicharset->PropertiesIncomplete(c)) { tprintWarn("Properties incomplete for index {} = {}\n", c, unicharset->id_to_unichar(c)); diff --git a/src/viewer/scrollview.cpp b/src/viewer/scrollview.cpp index 0e08711595..fc747a812c 100644 --- a/src/viewer/scrollview.cpp +++ b/src/viewer/scrollview.cpp @@ -368,7 +368,7 @@ void InteractiveScrollView::Initialize(Tesseract *tess, const char *name, // Set up an actual Window on the client side. 
char message[kMaxMsgSize]; snprintf(message, sizeof(message), - "w%u = luajava.newInstance('com.google.scrollview.ui" + "w%d = luajava.newInstance('com.google.scrollview.ui" ".SVWindow','%s',%u,%u,%u,%u,%u,%u,%u)\n", window_id_, window_name_, window_id_, x_pos, y_pos, x_size, y_size, x_canvas_size, y_canvas_size); @@ -466,7 +466,7 @@ void InteractiveScrollView::vSendMsg(fmt::string_view format, fmt::format_args a } char winidstr[kMaxIntPairSize]; - snprintf(winidstr, kMaxIntPairSize, "w%u:", window_id_); + snprintf(winidstr, kMaxIntPairSize, "w%d:", window_id_); std::string form(winidstr); form += message; stream_->Send(form.c_str()); @@ -613,7 +613,7 @@ void InteractiveScrollView::vAddMessage(fmt::string_view format, fmt::format_arg auto message = fmt::vformat(format, args); char winidstr[kMaxIntPairSize]; - snprintf(winidstr, kMaxIntPairSize, "w%u:", window_id_); + snprintf(winidstr, kMaxIntPairSize, "w%d:", window_id_); std::string form(winidstr); form += message; diff --git a/src/viewer/svutil.cpp b/src/viewer/svutil.cpp index 88eb84760a..7ad7f857e7 100644 --- a/src/viewer/svutil.cpp +++ b/src/viewer/svutil.cpp @@ -266,8 +266,6 @@ SVNetwork::SVNetwork(const char *hostname, int port) { buffer_ptr_ = nullptr; - struct addrinfo *addr_info = nullptr; - struct addrinfo hints = {0, PF_INET, SOCK_STREAM}; auto port_string = std::to_string(port); # if defined(WIN32) || defined(_WIN32) || defined(_WIN64) // Initialize Winsock @@ -278,6 +276,10 @@ SVNetwork::SVNetwork(const char *hostname, int port) { } # endif // _WIN32 + struct addrinfo *addr_info = nullptr; + struct addrinfo hints = {}; + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_STREAM; if (getaddrinfo(hostname, port_string.c_str(), &hints, &addr_info) != 0) { std::cerr << "Error resolving name for ScrollView host " << std::string(hostname) << ":" << port << std::endl; diff --git a/src/wordrec/associate.h b/src/wordrec/associate.h index 910ef9a4dd..36cca86496 100644 --- a/src/wordrec/associate.h +++ 
b/src/wordrec/associate.h @@ -55,13 +55,13 @@ struct AssociateStats { float shape_cost; // cost of blob shape bool bad_shape; // true if the shape of the blob is unacceptable - float full_wh_ratio; // width-to-hight ratio + gap on the right - float full_wh_ratio_total; // sum of width-to-hight ratios + float full_wh_ratio; // width-to-height ratio + gap on the right + float full_wh_ratio_total; // sum of width-to-height ratios // on the path terminating at this blob float full_wh_ratio_var; // variance of full_wh_ratios on the path bool bad_fixed_pitch_right_gap; // true if there is no gap before // the blob on the right - bool bad_fixed_pitch_wh_ratio; // true if the blobs has width-to-hight + bool bad_fixed_pitch_wh_ratio; // true if the blobs has width-to-height // ratio > kMaxFixedPitchCharAspectRatio int gap_sum; // sum of gaps within the blob }; diff --git a/src/wordrec/language_model.h b/src/wordrec/language_model.h index e0906cd4d4..70ff7fcea2 100644 --- a/src/wordrec/language_model.h +++ b/src/wordrec/language_model.h @@ -276,7 +276,7 @@ class LanguageModel : public LanguageModelSettings { // (used by ComputeNgramCost()). float ComputeDenom(BLOB_CHOICE_LIST *curr_list); - // Fills the given consistenty_info based on parent_vse.consistency_info + // Fills the given consistency_info based on parent_vse.consistency_info // and on the consistency of the given unichar_id with parent_vse. void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, diff --git a/src/wordrec/lm_state.h b/src/wordrec/lm_state.h index 35e39d11d2..aa760361ea 100644 --- a/src/wordrec/lm_state.h +++ b/src/wordrec/lm_state.h @@ -89,7 +89,7 @@ struct LanguageModelNgramInfo { /// Struct for storing the information about a path in the segmentation graph /// explored by Viterbi search. 
-struct ViterbiStateEntry : public ELIST_LINK { +struct ViterbiStateEntry : public ELIST::LINK { ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol, const LMConsistencyInfo &ci, const AssociateStats &as, LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n, @@ -133,9 +133,7 @@ struct ViterbiStateEntry : public ELIST_LINK { } /// Comparator function for sorting ViterbiStateEntry_LISTs in /// non-increasing order of costs. - static int Compare(const void *e1, const void *e2) { - const ViterbiStateEntry *ve1 = *static_cast(e1); - const ViterbiStateEntry *ve2 = *static_cast(e2); + static int Compare(const ViterbiStateEntry *ve1, const ViterbiStateEntry *ve2) { return (ve1->cost < ve2->cost) ? -1 : 1; } inline bool Consistent() const { diff --git a/src/wordrec/wordrec.cpp b/src/wordrec/wordrec.cpp index 2b196f466a..63485c1e5e 100644 --- a/src/wordrec/wordrec.cpp +++ b/src/wordrec/wordrec.cpp @@ -101,11 +101,11 @@ Wordrec::Wordrec() "Save alternative paths found during chopping" " and segmentation search", params()) + , language_model_(this, std::make_unique(&get_fontinfo_table(), &(getDict()))) , pass2_ok_split_(0.0f) , language_model_(this, &get_fontinfo_table(), &getDict()) -{ - prev_word_best_choice_ = nullptr; - fill_lattice_ = nullptr; + , prev_word_best_choice_(nullptr) + , fill_lattice_(nullptr) { } } // namespace tesseract diff --git a/src/wordrec/wordrec.h b/src/wordrec/wordrec.h index 87aeedfbc4..2caf2ace1c 100644 --- a/src/wordrec/wordrec.h +++ b/src/wordrec/wordrec.h @@ -167,7 +167,7 @@ class SegSearchPending { }; /* ccmain/tstruct.cpp *********************************************************/ -class FRAGMENT : public ELIST_LINK { +class FRAGMENT : public ELIST::LINK { public: FRAGMENT() { // constructor } diff --git a/sw.cpp b/sw.cpp index 632e03e4d7..7af0ff07b5 100644 --- a/sw.cpp +++ b/sw.cpp @@ -16,7 +16,6 @@ void build(Solution &s) libtesseract += "TESS_API"_api; libtesseract += "include/.*"_rr; 
libtesseract += "src/.+/.*"_rr; - libtesseract -= "src/lstm/.*\\.cc"_rr; libtesseract -= "src/training/.*"_rr; libtesseract.Public += "include"_idir; @@ -335,8 +334,7 @@ void build(Solution &s) auto &tw = add_test("tatweel"); tw += "unittest/util/.*"_rr; - tw += "unittest/third_party/.*"_rr; - tw -= "unittest/third_party/googletest/.*"_rr; + tw += "unittest/third_party/utf/.*"_rr; } } diff --git a/tesseract.pc.cmake b/tesseract.pc.cmake index 6bd0214eed..96b9d968b0 100644 --- a/tesseract.pc.cmake +++ b/tesseract.pc.cmake @@ -8,10 +8,10 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ # Package Information Name: @tesseract_NAME@ -Description: An OCR Engine developed at HP Labs between 1985 and 1995, now maintained by Google. +Description: An OCR Engine that was developed at HP Labs (1985-1995) and Google (2006-2018). URL: https://github.com/tesseract-ocr/tesseract Version: @tesseract_VERSION@ Requires.private: lept -Libs: -L${libdir} -l@tesseract_OUTPUT_NAME@ @libarchive_LIBS@ @libcurl_LIBS@ @TENSORFLOW_LIBS@ +Libs: -L${libdir} -l@tesseract_OUTPUT_NAME@ @libarchive_LIBS@ @libcurl_LIBS@ Libs.private: Cflags: -I${includedir} diff --git a/tesseract.pc.in b/tesseract.pc.in index 88d1684940..4d8dc29667 100644 --- a/tesseract.pc.in +++ b/tesseract.pc.in @@ -11,10 +11,10 @@ includedir = @includedir@ # Package Information Name: @PACKAGE_NAME@ -Description: An OCR Engine developed at HP Labs between 1985 and 1995, now maintained by Google. +Description: An OCR Engine that was developed at HP Labs (1985-1995) and Google (2006-2018). 
URL: https://github.com/tesseract-ocr/tesseract Version: @VERSION@ Requires.private: lept -Libs: -L${libdir} -ltesseract @libarchive_LIBS@ @libcurl_LIBS@ @TENSORFLOW_LIBS@ +Libs: -L${libdir} -ltesseract @libarchive_LIBS@ @libcurl_LIBS@ Libs.private: -lpthread Cflags: -I${includedir} diff --git a/test b/test index c9d342e2c6..232ff181c6 160000 --- a/test +++ b/test @@ -1 +1 @@ -Subproject commit c9d342e2c6212d50da2f21efb6bd4b2c4545f773 +Subproject commit 232ff181c66516116ec0e84c4963f70de15050fd diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt new file mode 100644 index 0000000000..6a91c62fc2 --- /dev/null +++ b/unittest/CMakeLists.txt @@ -0,0 +1,110 @@ +# find_package(GTest REQUIRED) +include(GoogleTest) # Todo install GoogleTests? + +# Set common include directories +set(COMMON_INCLUDE_DIRS + ${CMAKE_CURRENT_BINARY_DIR}/../src/training + ${CMAKE_CURRENT_SOURCE_DIR}/../src/ccutil + ${CMAKE_CURRENT_SOURCE_DIR}/../src/ccstruct + ${CMAKE_CURRENT_SOURCE_DIR}/../src/viewer + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/../src/training/unicharset + ${CMAKE_CURRENT_SOURCE_DIR}/../src/training/common + ${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/googlemock/include) + +if (MSVC) + set(TESSBIN_DIR ${EXECUTABLE_OUTPUT_PATH}/$) +else() + set(TESSBIN_DIR ${EXECUTABLE_OUTPUT_PATH}) +endif() + +# Set common compile definitions +set(COMMON_COMPILE_DEFINITIONS + "-DTESTING_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../test/testing\"" + "-DTESSDATA_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../tessdata\"" + "-DTESSBIN_DIR=\"${TESSBIN_DIR}\"" + "-DTESTDATA_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../test/testdata\"" + "-DLANGDATA_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../langdata_lstm\"") + +file( + GLOB TEST_SOURCES + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "*.cc") + +set(COMMON_LINK_LIBS libtesseract GTest::gtest_main common_training + unicharset_training) + +set(TRAINING_TESTS + commandlineflags_test.cc + dawg_test.cc + lstm_recode_test.cc + lstm_squashed_test.cc + 
lstm_test.cc + lstm_test.cc + normstrngs_test.cc + unichar_test.cc + unicharcompress_test.cc + unicharset_test.cc + validate_grapheme_test.cc + validate_indic_test.cc + validate_khmer_test.cc + validate_myanmar_test.cc + validator_test.cc) + +set(PANGO_TESTS ligature_table_test.cc pango_font_info_test.cc + pango_font_info_test.cc stringrenderer_test.cc) + +set(LEGACY_TESTS + applybox_test.cc + bitvector_test.cc + equationdetect_test.cc + indexmapbidi_test.cc + intfeaturemap_test.cc + mastertrainer_test.cc + osd_test.cc + params_model_test.cc + shapetable_test.cc) + +if(BUILD_TRAINING_TOOLS AND PANGO_FOUND) + list(APPEND COMMON_INCLUDE_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/../src/training/pango ${PANGO_INCLUDE_DIRS}) + +else() + list(REMOVE_ITEM TEST_SOURCES ${PANGO_TESTS}) +endif() + +if(DISABLED_LEGACY_ENGINE) + list(REMOVE_ITEM TEST_SOURCES ${LEGACY_TESTS}) +endif() + +if(NOT BUILD_TRAINING_TOOLS) + list(REMOVE_ITEM TEST_SOURCES ${TRAINING_TESTS}) +endif() + +set(TATWEEL_TEST_EXTRA_SRC util/utf8/unilib.cc util/utf8/unicodetext.cc + third_party/utf/rune.c) + +message(STATUS "Enabled tests: ${TEST_SOURCES}") + +foreach(test_source IN LISTS TEST_SOURCES) + get_filename_component(test_name ${test_source} NAME_WE) + if(${test_source} IN_LIST PANGO_TESTS) + list(APPEND COMMON_LINK_LIBS pango_training ${PANGO_LIBRARIES}) + endif() + if(${test_name} MATCHES "tatweel_test") + list(APPEND test_source ${TATWEEL_TEST_EXTRA_SRC}) + list(APPEND COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/util/utf8) + endif() + add_executable(${test_name} ${test_source}) + if(${test_name} MATCHES "progress_test") + target_link_libraries(${test_name} PRIVATE GTest::gmock) + endif() + target_compile_definitions(${test_name} PRIVATE ${COMMON_COMPILE_DEFINITIONS}) + target_include_directories(${test_name} PRIVATE ${COMMON_INCLUDE_DIRS}) + target_link_libraries(${test_name} PRIVATE ${COMMON_LINK_LIBS}) + add_test(NAME ${test_name} COMMAND ${test_name}) +endforeach() + 
+# Discover tests gtest_discover_tests(apiexample_test baseapi_test +# baseapi_thread_test) add_test(baseapi_gtests baseapi_test.cc) diff --git a/unittest/README.md b/unittest/README.md index 2d7742993a..113e82831d 100644 --- a/unittest/README.md +++ b/unittest/README.md @@ -82,6 +82,9 @@ To run the tests, do the following in tesseract folder ``` autoreconf -fiv git submodule update --init +git clone https://github.com/egorpugin/tessdata tessdata_unittest --depth 1 +cp tessdata_unittest/fonts/* test/testing/ +mv tessdata_unittest/* ../ export TESSDATA_PREFIX=/prefix/to/path/to/tessdata make check ``` diff --git a/unittest/baseapi_test.cc b/unittest/baseapi_test.cc index 76eebe3dcf..b42b1381b4 100644 --- a/unittest/baseapi_test.cc +++ b/unittest/baseapi_test.cc @@ -261,7 +261,7 @@ TEST_F(TesseractTest, LSTMGeometryTest) { tess_blob_box.rotate(block->re_rotation()); // verify that each of LSTM's character boxes lies close to within // tesseract's word box - for (int i = 0; i < word->box_word->length(); ++i) { + for (size_t i = 0; i < word->box_word->length(); ++i) { TBOX lstm_blob_box = word->box_word->BlobBox(i); // LSTM character box should not spill out of tesseract word box // by more than a few pixels in any direction diff --git a/unittest/fuzzers/oss-fuzz-build.sh b/unittest/fuzzers/oss-fuzz-build.sh index 5c7a37bb49..491e19ee00 100755 --- a/unittest/fuzzers/oss-fuzz-build.sh +++ b/unittest/fuzzers/oss-fuzz-build.sh @@ -32,7 +32,7 @@ mkdir -p "$OUT"/tessdata ( cd "$OUT"/tessdata test -f eng.traineddata || \ - curl -L -O https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata + curl -sSL -O https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata ) # OSS-Fuzz requires static linking for the project specific libraries, diff --git a/unittest/intsimdmatrix_test.cc b/unittest/intsimdmatrix_test.cc index f0e9866e6f..f365b8e3c6 100644 --- a/unittest/intsimdmatrix_test.cc +++ b/unittest/intsimdmatrix_test.cc @@ -93,9 +93,9 @@ class 
IntSimdMatrixTest : public ::testing::Test { } // Compare sum of all results with expected value. #ifdef FAST_FLOAT - EXPECT_FLOAT_EQ(total, 337852.16f); + EXPECT_FLOAT_EQ(total, -423236.53f); #else - EXPECT_FLOAT_EQ(total, 337849.39354684710); + EXPECT_FLOAT_EQ(total, -423243.392011); #endif } diff --git a/unittest/linlsq_test.cc b/unittest/linlsq_test.cc index cac5dba24e..e278c9f999 100644 --- a/unittest/linlsq_test.cc +++ b/unittest/linlsq_test.cc @@ -103,15 +103,15 @@ TEST_F(LLSQTest, Vectors) { // sqrt( sum (!nvec * (x_i - x_avg))^2 / n) TEST_F(LLSQTest, RmsOrthWorksAsIntended) { std::vector pts; - pts.emplace_back(0.56, 0.95); - pts.emplace_back(0.09, 0.09); - pts.emplace_back(0.13, 0.77); - pts.emplace_back(0.16, 0.83); - pts.emplace_back(0.45, 0.79); - VerifyRmsOrth(pts, FCOORD(1, 0)); - VerifyRmsOrth(pts, FCOORD(1, 1)); - VerifyRmsOrth(pts, FCOORD(1, 2)); - VerifyRmsOrth(pts, FCOORD(2, 1)); + pts.emplace_back(0.56f, 0.95f); + pts.emplace_back(0.09f, 0.09f); + pts.emplace_back(0.13f, 0.77f); + pts.emplace_back(0.16f, 0.83f); + pts.emplace_back(0.45f, 0.79f); + VerifyRmsOrth(pts, FCOORD(1.f, 0.f)); + VerifyRmsOrth(pts, FCOORD(1.f, 1.f)); + VerifyRmsOrth(pts, FCOORD(1.f, 2.f)); + VerifyRmsOrth(pts, FCOORD(2.f, 1.f)); } } // namespace tesseract diff --git a/unittest/list_test.cc b/unittest/list_test.cc index 70afd22a46..990d17b8f1 100644 --- a/unittest/list_test.cc +++ b/unittest/list_test.cc @@ -25,19 +25,19 @@ class ListTest : public ::testing::Test { const size_t ListSize = 5; }; -class Clst : public CLIST_LINK { +class Clst { public: Clst(unsigned n) : value(n) {} unsigned value; }; -class Elst : public ELIST_LINK { +class Elst : public ELIST::LINK { public: Elst(unsigned n) : value(n) {} unsigned value; }; -class Elst2 : public ELIST2_LINK { +class Elst2 : public ELIST2::LINK { public: Elst2(unsigned n) : value(n) {} unsigned value; @@ -51,7 +51,7 @@ TEST_F(ListTest, TestCLIST) { Clst_CLIST list; EXPECT_TRUE(list.empty()); EXPECT_EQ(list.length(), 0); - 
auto it = CLIST_ITERATOR(&list); + auto it = Clst_CLIST::ITERATOR(&list); for (unsigned i = 0; i < ListSize; i++) { auto *lst = new Clst(i); it.add_to_end(lst); @@ -82,7 +82,7 @@ TEST_F(ListTest, TestELIST) { Elst_LIST list; EXPECT_TRUE(list.empty()); EXPECT_EQ(list.length(), 0); - auto it = ELIST_ITERATOR(&list); + auto it = ELIST::ITERATOR(&list); for (unsigned i = 0; i < ListSize; i++) { auto *elst = new Elst(i); it.add_to_end(elst); @@ -113,7 +113,7 @@ TEST_F(ListTest, TestELIST2) { Elst2_LIST list; EXPECT_TRUE(list.empty()); EXPECT_EQ(list.length(), 0); - auto it = ELIST2_ITERATOR(&list); + auto it = ELIST2::ITERATOR(&list); for (unsigned i = 0; i < ListSize; i++) { auto *lst = new Elst2(i); it.add_to_end(lst); diff --git a/unittest/lstm_test.cc b/unittest/lstm_test.cc index 4b3d4ac271..56ff0f900b 100644 --- a/unittest/lstm_test.cc +++ b/unittest/lstm_test.cc @@ -1,25 +1,26 @@ -// (C) Copyright 2017, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Generating the training data: -// If the format of the lstmf (ImageData) file changes, the training data will -// have to be regenerated as follows: -// -// Use --xsize 800 for text2image to be similar to original training data. 
-// -// tesstrain.py --fonts_dir /usr/share/fonts --lang eng \ -// --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \ -// --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \ -// --fontlist "Arial" --maxpages 10 -// +/* + * (C) Copyright 2017, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + + * Generating the training data: + * If the format of the lstmf (ImageData) file changes, the training data will + * have to be regenerated as follows: + * + * Use --xsize 800 for text2image to be similar to original training data. 
+ * + * tesstrain.py --fonts_dir /usr/share/fonts --lang eng \ + * --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \ + * --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \ + * --fontlist "Arial" --maxpages 10 + */ #include "lstm_test.h" diff --git a/unittest/mastertrainer_test.cc b/unittest/mastertrainer_test.cc index 040ad56075..2845df4a82 100644 --- a/unittest/mastertrainer_test.cc +++ b/unittest/mastertrainer_test.cc @@ -160,9 +160,9 @@ class MasterTrainerTest : public testing::Test { return file::JoinPath(FLAGS_test_tmpdir, name); } - MasterTrainerTest() { - shape_table_ = nullptr; - master_trainer_ = nullptr; + MasterTrainerTest() : + shape_table_(nullptr), + master_trainer_(nullptr) { } ~MasterTrainerTest() override { delete shape_table_; diff --git a/unittest/pagesegmode_test.cc b/unittest/pagesegmode_test.cc index 0bf473e65c..8997f7731d 100644 --- a/unittest/pagesegmode_test.cc +++ b/unittest/pagesegmode_test.cc @@ -9,14 +9,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(_WIN32) -# include // for _access -#else -# include // for access -#endif #include #include #include +#include #include #include "helpers.h" #include "include_gunit.h" @@ -26,15 +22,6 @@ namespace tesseract { -// Replacement for std::filesystem::exists (C++-17) -static bool file_exists(const char *filename) { -#if defined(_WIN32) - return _access(filename, 0) == 0; -#else - return access(filename, 0) == 0; -#endif -} - // The fixture for testing Tesseract. class PageSegModeTest : public testing::Test { protected: @@ -88,7 +75,7 @@ class PageSegModeTest : public testing::Test { // and differently to line and block mode. 
TEST_F(PageSegModeTest, WordTest) { std::string filename = file::JoinPath(TESTING_DIR, "segmodeimg.tif"); - if (!file_exists(filename.c_str())) { + if (!std::filesystem::exists(filename)) { LOG(INFO) << "Skip test because of missing " << filename << '\n'; GTEST_SKIP(); } else { diff --git a/unittest/paragraphs_test.cc b/unittest/paragraphs_test.cc index fcc54b00e0..a073b9c0f9 100644 --- a/unittest/paragraphs_test.cc +++ b/unittest/paragraphs_test.cc @@ -66,11 +66,11 @@ void AsciiToRowInfo(const char *text, int row_number, RowInfo *info) { info->lword_text = words[0].c_str(); info->rword_text = words[words.size() - 1].c_str(); - int lspace = 0; + size_t lspace = 0; while (lspace < info->text.size() && text[lspace] == ' ') { lspace++; } - int rspace = 0; + size_t rspace = 0; while (rspace < info->text.size() && text[info->text.size() - rspace - 1] == ' ') { rspace++; } diff --git a/unittest/recodebeam_test.cc b/unittest/recodebeam_test.cc index 0792088e09..b504d377ba 100644 --- a/unittest/recodebeam_test.cc +++ b/unittest/recodebeam_test.cc @@ -182,7 +182,7 @@ class RecodeBeamTest : public ::testing::Test { for (int i = 0; i < 2; ++i) { beam_search.ExtractBestPathAsWords(line_box, 1.0f, &ccutil_.unicharset, words); std::string w_decoded; - for (int w = 0; w < words->size(); ++w) { + for (size_t w = 0; w < words->size(); ++w) { const WERD_RES *word = (*words)[w]; if (w_decoded.size() < truth_utf8.size()) { if (!w_decoded.empty() && word->word->space()) { @@ -454,7 +454,7 @@ TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) { ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words); // Each is an individual word, with permuter = top choice. EXPECT_EQ(7, words.size()); - for (int w = 0; w < words.size(); ++w) { + for (size_t w = 0; w < words.size(); ++w) { EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter()); } // Now try again with the dictionary. 
@@ -468,7 +468,7 @@ TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) { const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM, SYSTEM_DAWG_PERM}; EXPECT_EQ(kNumWords, words.size()); - for (int w = 0; w < kNumWords && w < words.size(); ++w) { + for (size_t w = 0; w < kNumWords && w < words.size(); ++w) { EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str()); EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter()); } diff --git a/unittest/tatweel_test.cc b/unittest/tatweel_test.cc index 5ccac37e68..99f05ade40 100644 --- a/unittest/tatweel_test.cc +++ b/unittest/tatweel_test.cc @@ -9,12 +9,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(_WIN32) -# include // for _access -#else -# include // for access -#endif - +#include #include "dawg.h" #include "include_gunit.h" #include "trie.h" @@ -25,15 +20,6 @@ namespace tesseract { -// Replacement for std::filesystem::exists (C++-17) -static bool file_exists(const char *filename) { -#if defined(_WIN32) - return _access(filename, 0) == 0; -#else - return access(filename, 0) == 0; -#endif -} - class TatweelTest : public ::testing::Test { protected: void SetUp() override { @@ -43,7 +29,7 @@ class TatweelTest : public ::testing::Test { TatweelTest() { std::string filename = TestDataNameToPath("ara.wordlist"); - if (file_exists(filename.c_str())) { + if (std::filesystem::exists(filename)) { std::string wordlist(u8"\u0640"); CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults())); // Put all the unicodes in the unicharset_. @@ -69,7 +55,7 @@ class TatweelTest : public ::testing::Test { TEST_F(TatweelTest, UnicharsetIgnoresTatweel) { // This test verifies that the unicharset ignores the Tatweel character. 
- for (int i = 0; i < unicharset_.size(); ++i) { + for (size_t i = 0; i < unicharset_.size(); ++i) { const char *utf8 = unicharset_.id_to_unichar(i); EXPECT_EQ(strstr(utf8, reinterpret_cast(u8"\u0640")), nullptr); } @@ -79,7 +65,7 @@ TEST_F(TatweelTest, DictIgnoresTatweel) { // This test verifies that the dictionary ignores the Tatweel character. tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, unicharset_.size(), 0); std::string filename = TestDataNameToPath("ara.wordlist"); - if (!file_exists(filename.c_str())) { + if (!std::filesystem::exists(filename)) { LOG(INFO) << "Skip test because of missing " << filename; GTEST_SKIP(); } else { @@ -93,13 +79,13 @@ TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) { // This test verifies that a load of an existing unicharset keeps any // existing tatweel for backwards compatibility. std::string filename = TestDataNameToPath("ara.unicharset"); - if (!file_exists(filename.c_str())) { + if (!std::filesystem::exists(filename)) { LOG(INFO) << "Skip test because of missing " << filename; GTEST_SKIP(); } else { EXPECT_TRUE(unicharset_.load_from_file(filename.c_str())); int num_tatweel = 0; - for (int i = 0; i < unicharset_.size(); ++i) { + for (size_t i = 0; i < unicharset_.size(); ++i) { const char *utf8 = unicharset_.id_to_unichar(i); if (strstr(utf8, reinterpret_cast(u8"\u0640")) != nullptr) { ++num_tatweel; diff --git a/unittest/third_party/googletest b/unittest/third_party/googletest index a8b592e7ef..7d76a231b0 160000 --- a/unittest/third_party/googletest +++ b/unittest/third_party/googletest @@ -1 +1 @@ -Subproject commit a8b592e7ef2e79b10f1d74d0b7c3e90975939eb6 +Subproject commit 7d76a231b0e29caf86e68d1df858308cd53b2a66 diff --git a/unittest/third_party/utf/rune.c b/unittest/third_party/utf/rune.c index 4b4f069742..6c4801141c 100644 --- a/unittest/third_party/utf/rune.c +++ b/unittest/third_party/utf/rune.c @@ -12,9 +12,9 @@ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 
*/ #include +#include #include #include "third_party/utf/utf.h" -#include "third_party/utf/utfdef.h" enum { Bit1 = 7, @@ -74,7 +74,7 @@ int charntorune(Rune *rune, const char *str, int length) { * one character sequence (7-bit value) * 00000-0007F => T1 */ - c = *(uchar *)str; + c = *(uint8_t *)str; if (c < Tx) { *rune = c; return 1; @@ -89,7 +89,7 @@ int charntorune(Rune *rune, const char *str, int length) { * two character sequence (11-bit value) * 0080-07FF => T2 Tx */ - c1 = *(uchar *)(str + 1) ^ Tx; + c1 = *(uint8_t *)(str + 1) ^ Tx; if (c1 & Testx) goto bad; if (c < T3) { @@ -111,7 +111,7 @@ int charntorune(Rune *rune, const char *str, int length) { * three character sequence (16-bit value) * 0800-FFFF => T3 Tx Tx */ - c2 = *(uchar *)(str + 2) ^ Tx; + c2 = *(uint8_t *)(str + 2) ^ Tx; if (c2 & Testx) goto bad; if (c < T4) { @@ -129,7 +129,7 @@ int charntorune(Rune *rune, const char *str, int length) { * four character sequence (21-bit value) * 10000-1FFFFF => T4 Tx Tx Tx */ - c3 = *(uchar *)(str + 3) ^ Tx; + c3 = *(uint8_t *)(str + 3) ^ Tx; if (c3 & Testx) goto bad; if (c < T5) { @@ -168,7 +168,7 @@ int chartorune(Rune *rune, const char *str) { * one character sequence * 00000-0007F => T1 */ - c = *(uchar *)str; + c = *(uint8_t *)str; if (c < Tx) { *rune = c; return 1; @@ -178,7 +178,7 @@ int chartorune(Rune *rune, const char *str) { * two character sequence * 0080-07FF => T2 Tx */ - c1 = *(uchar *)(str + 1) ^ Tx; + c1 = *(uint8_t *)(str + 1) ^ Tx; if (c1 & Testx) goto bad; if (c < T3) { @@ -195,7 +195,7 @@ int chartorune(Rune *rune, const char *str) { * three character sequence * 0800-FFFF => T3 Tx Tx */ - c2 = *(uchar *)(str + 2) ^ Tx; + c2 = *(uint8_t *)(str + 2) ^ Tx; if (c2 & Testx) goto bad; if (c < T4) { @@ -210,7 +210,7 @@ int chartorune(Rune *rune, const char *str) { * four character sequence (21-bit value) * 10000-1FFFFF => T4 Tx Tx Tx */ - c3 = *(uchar *)(str + 3) ^ Tx; + c3 = *(uint8_t *)(str + 3) ^ Tx; if (c3 & Testx) goto bad; if (c < T5) { @@ 
-304,7 +304,7 @@ int runelen(Rune rune) { int runenlen(const Rune *r, int nrune) { int nb; - ulong c; /* Rune is signed, so use unsigned for range check. */ + unsigned long c; /* Rune is signed, so use unsigned for range check. */ nb = 0; while (nrune--) { @@ -325,7 +325,7 @@ int runenlen(const Rune *r, int nrune) { int fullrune(const char *str, int n) { if (n > 0) { - int c = *(uchar *)str; + int c = *(uint8_t *)str; if (c < Tx) return 1; if (n > 1) { diff --git a/unittest/third_party/utf/utfdef.h b/unittest/third_party/utf/utfdef.h deleted file mode 100644 index deaf396b32..0000000000 --- a/unittest/third_party/utf/utfdef.h +++ /dev/null @@ -1,14 +0,0 @@ -#define uchar _utfuchar -#define ushort _utfushort -#define uint _utfuint -#define ulong _utfulong -#define vlong _utfvlong -#define uvlong _utfuvlong - -typedef unsigned char uchar; -typedef unsigned short ushort; -typedef unsigned int uint; -typedef unsigned long ulong; - -#define nelem(x) (sizeof(x) / sizeof((x)[0])) -#define nil ((void *)0) diff --git a/unittest/unicharcompress_test.cc b/unittest/unicharcompress_test.cc index 953d4cdf1b..b66be07786 100644 --- a/unittest/unicharcompress_test.cc +++ b/unittest/unicharcompress_test.cc @@ -86,7 +86,7 @@ class UnicharcompressTest : public ::testing::Test { } int code_range = compressed_.code_range(); std::vector times_seen(code_range, zeros); - for (int u = 0; u <= unicharset_.size(); ++u) { + for (size_t u = 0; u <= unicharset_.size(); ++u) { if (u != UNICHAR_SPACE && u != null_char_ && (u == unicharset_.size() || (unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) { @@ -162,7 +162,7 @@ class UnicharcompressTest : public ::testing::Test { UnicharCompress compressed_; UNICHARSET unicharset_; - int null_char_; + size_t null_char_; // The encoding of the null_char_. 
int encoded_null_char_; }; @@ -216,7 +216,7 @@ TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) { ExpectCorrect("por"); // Check that any unichar-id that is encoded with multiple codes has the // correct encoded_null_char_ in between. - for (int u = 0; u <= unicharset_.size(); ++u) { + for (size_t u = 0; u <= unicharset_.size(); ++u) { RecodedCharID code; int len = compressed_.EncodeUnichar(u, &code); if (len > 1) {