Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modernize datadir #4372

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ static void ExtractFontName(const char* filename, std::string* fontname) {
*/
static void addAvailableLanguages(const std::string &datadir,
std::vector<std::string> *langs) {
if (!std::filesystem::is_directory(datadir))
return;

for (const auto& entry :
std::filesystem::recursive_directory_iterator(datadir,
std::filesystem::directory_options::follow_directory_symlink |
Expand Down Expand Up @@ -347,7 +350,7 @@ int TessBaseAPI::Init(const char *data, int data_size, const char *language, Ocr
// Update datapath and language requested for the last valid initialization.
datapath_ = std::move(datapath);
if (datapath_.empty() && !tesseract_->datadir.empty()) {
datapath_ = tesseract_->datadir;
datapath_ = tesseract_->datadir.string();
}

language_ = language;
Expand Down Expand Up @@ -396,7 +399,7 @@ void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) co
// Fills *langs with whatever addAvailableLanguages() collects for the data
// directory of tesseract_, then sorts the entries with std::sort.
// *langs is always cleared first and is left empty when tesseract_ is null.
void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
langs->clear();
if (tesseract_ != nullptr) {
// NOTE(review): the next two calls look like the before/after pair of a
// single changed line (datadir passed directly vs. via .string());
// presumably only one of them is meant to remain - confirm against the
// real file, otherwise the directory is scanned twice.
addAvailableLanguages(tesseract_->datadir, langs);
addAvailableLanguages(tesseract_->datadir.string(), langs);
std::sort(langs->begin(), langs->end());
}
}
Expand Down Expand Up @@ -858,7 +861,7 @@ const char *TessBaseAPI::GetInputName() {
}

// Returns the data path as a C string.
// NOTE(review): the two return statements look like the before/after pair of
// a single changed line; as written the second one is unreachable. The first
// dereferences tesseract_ without a null check, the second reads the
// datapath_ member instead - confirm which one is intended.
const char *TessBaseAPI::GetDatapath() {
return tesseract_->datadir.c_str();
return datapath_.c_str();
}

int TessBaseAPI::GetSourceYResolution() {
Expand Down
2 changes: 1 addition & 1 deletion src/ccmain/paramsd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) {
SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess);

std::string paramfile;
paramfile = tess->datadir;
paramfile = tess->datadir.string();
paramfile += VARDIR; // parameters dir
paramfile += "edited"; // actual name

Expand Down
49 changes: 23 additions & 26 deletions src/ccmain/tessedit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,24 +43,25 @@
// Read a "config" file containing a set of variable, value pairs.
// Searches the standard places: tessdata/configs, tessdata/tessconfigs
// and also accepts a relative or absolute path name.
void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
  // Probe <datadir>configs/<filename> first, then <datadir>tessconfigs/<filename>.
  // A candidate is accepted as soon as it can be opened for binary reading;
  // the probe handle is closed again immediately.
  std::string chosen;
  bool located = false;
  for (const char *subdir : {"configs/", "tessconfigs/"}) {
    chosen = datadir;
    chosen += subdir;
    chosen += filename;
    if (FILE *probe = fopen(chosen.c_str(), "rb")) {
      fclose(probe);
      located = true;
      break;
    }
  }

  // Neither standard location worked: treat filename itself as the path.
  if (!located) {
    chosen = filename;
  }
  ParamUtils::ReadParamsFile(chosen.c_str(), constraint, this->params());
}
void Tesseract::read_config_file(const char *filename,
SetParamConstraint constraint) {
// Construct potential config file paths
std::vector<std::filesystem::path> config_paths = {
datadir / "configs" / filename,
datadir / "tessconfigs" / filename,
std::filesystem::path(filename)};

// Use the first existing file or fallback to the last (filename)
auto config_file = std::find_if(config_paths.begin(), config_paths.end(),
[](const std::filesystem::path &path) {
std::error_code ec;
return std::filesystem::exists(path, ec);
});
const std::filesystem::path &selected_path =
(config_file != config_paths.end()) ? *config_file : config_paths.back();

ParamUtils::ReadParamsFile(selected_path.string().c_str(), constraint,
this->params());
}

// Returns false if a unicharset file for the specified language was not found
Expand All @@ -81,14 +82,11 @@
bool set_only_non_debug_params, TessdataManager *mgr) {
// Set the language data path prefix
lang = !language.empty() ? language : "eng";
language_data_path_prefix = datadir;
language_data_path_prefix += lang;
language_data_path_prefix += ".";
std::filesystem::path tessdata_path = datadir / (lang + "." + kTrainedDataSuffix);

// Initialize TessdataManager.
std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
tprintf("Error opening data file %s\n", tessdata_path.c_str());
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string().c_str())) {
tprintf("Error opening data file %s\n", tessdata_path.string());
Fixed Show fixed Hide fixed
stweil marked this conversation as resolved.
Show resolved Hide resolved
tprintf(
"Please make sure the TESSDATA_PREFIX environment variable is set"
" to your \"tessdata\" directory.\n");
Expand Down Expand Up @@ -186,8 +184,7 @@
else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
tprintf(
stweil marked this conversation as resolved.
Show resolved Hide resolved
"Error: Tesseract (legacy) engine requested, but components are "
"not present in %s!!\n",
tessdata_path.c_str());
"not present in %s!!\n", tessdata_path.string());
Fixed Show fixed Hide fixed
return false;
}
#endif // ndef DISABLED_LEGACY_ENGINE
Expand Down
113 changes: 58 additions & 55 deletions src/ccutil/ccutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
#include "tprintf.h" // for tprintf

#include <cstdlib>
#include <cstring> // for std::strrchr
#include <filesystem> // for std::filesystem

namespace tesseract {

Expand All @@ -33,68 +31,73 @@
CCUtil::~CCUtil() = default;

/**
* @brief CCUtil::main_setup - set location of tessdata and name of image
* @brief Finds the path to the tessdata directory.
*
* @param argv0 - paths to the directory with language files and config files.
* An actual value of argv0 is used if not nullptr, otherwise TESSDATA_PREFIX is
* used if not nullptr, next try to use compiled in -DTESSDATA_PREFIX. If
* previous is not successful - use current directory.
* @param basename - name of image
* This function determines the location of the tessdata directory based on the
* following order of precedence:
* 1. If `argv0` is provided, use it.
* 2. If `TESSDATA_PREFIX` environment variable is set and the path exists, use
* it.
* 3. On Windows, check for a "tessdata" directory in the executable's directory
* and use it.
* 4. If `TESSDATA_PREFIX` is defined at compile time, use it.
* 5. Otherwise, use the current working directory.
*
* @param argv0 argument to be considered as the data directory path.
* @return The path to the tessdata directory or current directory.
*/
void CCUtil::main_setup(const std::string &argv0, const std::string &basename) {
imagebasename = basename; /**< name of image */

const char *tessdata_prefix = getenv("TESSDATA_PREFIX");

// Ignore TESSDATA_PREFIX if there is no matching filesystem entry.
if (tessdata_prefix != nullptr && !std::filesystem::exists(tessdata_prefix)) {
tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignore it\n", tessdata_prefix);
tessdata_prefix = nullptr;
}

// NOTE(review): this span interleaves the new find_data_path() body with
// lines that appear to belong to the previous CCUtil::main_setup() body
// (the statements assigning to datadir) plus one line of page residue.
// The comments below only describe what the visible statements do.
std::filesystem::path find_data_path(const std::string &argv0) {
// If argv0 is set, always use it even if it is not a valid directory
if (!argv0.empty()) {
/* Use tessdata prefix from the command line. */
datadir = argv0;
} else if (tessdata_prefix) {
/* Use tessdata prefix from the environment. */
datadir = tessdata_prefix;
#if defined(_WIN32)
} else if (datadir.empty() || !std::filesystem::exists(datadir)) {
/* Look for tessdata in directory of executable. */
char path[_MAX_PATH];
DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
if (length > 0 && length < sizeof(path)) {
char *separator = std::strrchr(path, '\\');
if (separator != nullptr) {
*separator = '\0';
std::string subdir = path;
subdir += "/tessdata";
if (std::filesystem::exists(subdir)) {
datadir = subdir;
}
}
std::filesystem::path path(argv0);
if (!std::filesystem::is_directory(path)) {
// NOTE(review): argv0 is a std::string but is handed to a %s conversion;
// presumably this needs argv0.c_str() - confirm against tprintf's signature.
tprintf("Warning (tessdata): '%s' is not a valid directory.\n",
argv0);
Fixed Show fixed Hide fixed
}
#endif /* _WIN32 */
// Returned even when the is_directory() check above failed (warning only).
return path;
}

// datadir may still be empty:
if (datadir.empty()) {
#if defined(TESSDATA_PREFIX)
// Use tessdata prefix which was compiled in.
datadir = TESSDATA_PREFIX "/tessdata/";
// Note that some software (for example conda) patches TESSDATA_PREFIX
// in the binary, so it might be shorter. Recalculate its length.
datadir.resize(std::strlen(datadir.c_str()));
#else
datadir = "./";
#endif /* TESSDATA_PREFIX */
// Check environment variable if argv0 is not specified
if (const char *tessdata_prefix = std::getenv("TESSDATA_PREFIX")) {
std::filesystem::path path(tessdata_prefix);
// Unlike the argv0 branch, this accepts any existing filesystem entry,
// not only directories.
if (std::filesystem::exists(path)) {
return path;
} else {
tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignoring.\n",
tessdata_prefix);
}
}

// check for missing directory separator
const char lastchar = datadir.back();
if (lastchar != '/' && lastchar != '\\') {
datadir += '/';
#ifdef _WIN32
// Windows-specific: look for a 'tessdata' entry next to the executable and
// use it if it exists
wchar_t path[MAX_PATH];
if (DWORD length = GetModuleFileNameW(nullptr, path, MAX_PATH);
length > 0 && length < MAX_PATH) {
std::filesystem::path exe_path(path);
auto tessdata_subdir = exe_path.parent_path() / "tessdata";
if (std::filesystem::exists(tessdata_subdir)) {
return tessdata_subdir;
}
}
#endif

// Fallback to compile-time or current directory
#ifdef TESSDATA_PREFIX
// The compiled-in prefix gets a "tessdata" component appended.
return std::filesystem::path(TESSDATA_PREFIX) / "tessdata";
#else
return std::filesystem::current_path();
#endif
}


/**
* @brief CCUtil::main_setup - set location of tessdata and name of image
*
* @param argv0 - paths to the directory with language files and config files.
*/
void CCUtil::main_setup(const std::string &argv0, const std::string &basename) {
imagebasename = basename; /**< name of image */
datadir = find_data_path(argv0);
}
} // namespace tesseract
7 changes: 4 additions & 3 deletions src/ccutil/ccutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#ifndef TESSERACT_CCUTIL_CCUTIL_H_
#define TESSERACT_CCUTIL_CCUTIL_H_

#include <filesystem> // for std::filesystem

#ifndef _WIN32
# include <pthread.h>
# include <semaphore.h>
Expand Down Expand Up @@ -53,9 +55,8 @@ class TESS_API CCUtil {
ParamsVectors *params() {
return &params_;
}

std::string datadir; // dir for data files
std::string imagebasename; // name of image
std::filesystem::path datadir; // dir for data files
std::string imagebasename; // name of image
std::string lang;
std::string language_data_path_prefix;
UNICHARSET unicharset;
Expand Down
Loading