10#include "lcf/config.h"
11#include "lcf/scope_guard.h"
13#if LCF_SUPPORT_ICU == 1
14# include <unicode/ucsdet.h>
15# include <unicode/ucnv.h>
16# include <unicode/normalizer2.h>
17# include <unicode/ustring.h>
18#elif LCF_SUPPORT_ICU == 2
20# error "icu.h only supported on Windows"
37#include "lcf/encoder.h"
38#include "lcf/inireader.h"
39#include "lcf/ldb/reader.h"
40#include "lcf/reader_util.h"
// Translates a numeric codepage into an encoding name string.
// Codepages 932 and 949 are special-cased to fixed names; every other value
// reaching the final return becomes "windows-<codepage>".
// NOTE(review): the returned names look like ICU converter names - confirm
// against the consumer of this string.
48std::string ReaderUtil::CodepageToEncoding(
int codepage) {
// Codepage 932 gets a dedicated name instead of "windows-932".
52 if (codepage == 932) {
53 return "ibm-943_P15A-2003";
// Codepage 949 gets a dedicated, versioned name.
55 if (codepage == 949) {
56 return "windows-949-2000";
// Generic case: prefix the decimal codepage with "windows-".
59 return "windows-" + std::to_string(codepage);
// Convenience wrapper around DetectEncodings(db): returns the first entry of
// the candidate list.
// The handling of an empty candidate list is not visible in this listing.
62std::string ReaderUtil::DetectEncoding(lcf::rpg::Database& db) {
63 std::vector<std::string> encodings = DetectEncodings(db);
// Guard for the case where no candidate was produced.
65 if (encodings.empty()) {
// Otherwise the first candidate in the list is the result.
69 return encodings.front();
// Produces a list of candidate encodings for a database by collecting strings
// from it and handing the result to the string overload of DetectEncodings.
72std::vector<std::string> ReaderUtil::DetectEncodings(lcf::rpg::Database& db) {
// Accumulates the sample text that is passed on for detection.
74 std::ostringstream text;
// Helper: converts a value with ToString and appends a single space so that
// consecutive strings stay separated.
76 auto append = [](
const auto& s) {
77 return ToString(s) +
" ";
// Visits the strings of db.system; the callback body is not visible here.
80 lcf::rpg::ForEachString(db.system, [&](
const auto& val,
const auto&) {
// Selection of db.terms fields that take part in detection (the full list is
// not visible in this listing).
98 db.terms.health_points,
99 db.terms.spirit_points,
100 db.terms.normal_status,
111 db.terms.save_game_message,
112 db.terms.load_game_message,
113 db.terms.exit_game_message,
// Delegate the actual detection to the string overload.
121 return ReaderUtil::DetectEncodings(text.str());
// Single-entry result "windows-1252".
// NOTE(review): presumably the branch used when detection support is compiled
// out - the surrounding preprocessor condition is not visible; confirm.
123 return {
"windows-1252"};
// Convenience wrapper around DetectEncodings(string): returns the first entry
// of the candidate list.
// The handling of an empty candidate list is not visible in this listing.
127std::string ReaderUtil::DetectEncoding(std::string_view
string) {
128 std::vector<std::string> encodings = DetectEncodings(
string);
// Guard for the case where no candidate was produced.
130 if (encodings.empty()) {
// Otherwise the first candidate in the list is the result.
134 return encodings.front();
// Produces a list of candidate encodings for a byte string using the ICU
// charset detector (ucsdet_* API). Names reported by the detector are
// translated to the specific converter names listed in the chain below.
137std::vector<std::string> ReaderUtil::DetectEncodings(std::string_view
string) {
138 std::vector<std::string> encodings;
// Detection is only attempted for non-empty input.
140 if (!
string.empty()) {
141 UErrorCode status = U_ZERO_ERROR;
// NOTE(review): no check of `status` after ucsdet_open is visible in this
// listing - confirm that a failed open is handled.
142 UCharsetDetector* detector = ucsdet_open(&status);
// Owning copy of the input; c_str()/length() of this copy feed the detector.
144 auto s = std::string(
string);
// Written by ucsdet_detectAll below; not initialized before that call.
147 int32_t matches_count;
148 const UCharsetMatch** matches =
nullptr;
151 ucsdet_setText(detector, s.c_str(), s.length(), &status);
152 matches = ucsdet_detectAll(detector, &matches_count, &status);
// No usable match: the body of this branch is not visible here.
154 if (!matches || matches_count < 1) {
// Confidence of the first match.
158 confidence = ucsdet_getConfidence(matches[0], &status);
// Condition holds when the first match has confidence above 70 or the sample
// is longer than 100 bytes; the body of this branch is not visible here.
160 if (confidence > 70 || s.length() > 100) {
168 if (matches !=
nullptr) {
// Walk all matches in the order returned by the detector.
171 for (
int i = 0; i < matches_count; ++i) {
172 std::string encoding = ucsdet_getName(matches[i], &status);
// Translate detector names to the converter names used by this library.
// ISO-8859-x names and their windows-125x counterparts share one target.
175 if (encoding ==
"Shift_JIS") {
176 encodings.emplace_back(
"ibm-943_P15A-2003");
177 }
else if (encoding ==
"EUC-KR") {
178 encodings.emplace_back(
"windows-949-2000");
179 }
else if (encoding ==
"GB18030") {
180 encodings.emplace_back(
"windows-936-2000");
181 }
else if (encoding ==
"ISO-8859-1" || encoding ==
"windows-1252") {
182 encodings.emplace_back(
"ibm-5348_P100-1997");
183 }
else if (encoding ==
"ISO-8859-2" || encoding ==
"windows-1250") {
184 encodings.emplace_back(
"ibm-5346_P100-1998");
185 }
else if (encoding ==
"ISO-8859-5" || encoding ==
"windows-1251") {
186 encodings.emplace_back(
"ibm-5347_P100-1998");
187 }
else if (encoding ==
"ISO-8859-6" || encoding ==
"windows-1256") {
188 encodings.emplace_back(
"ibm-9448_X100-2005");
189 }
else if (encoding ==
"ISO-8859-7" || encoding ==
"windows-1253") {
190 encodings.emplace_back(
"ibm-5349_P100-1998");
191 }
else if (encoding ==
"ISO-8859-8" || encoding ==
"windows-1255") {
192 encodings.emplace_back(
"ibm-9447_P100-2002");
193 }
// UTF-16 matches get their own branch; its body is not visible here.
else if (encoding ==
"UTF-16BE" || encoding ==
"UTF-16LE") {
// Detector name stored unchanged.
// NOTE(review): presumably the final else branch for unmapped names - confirm.
196 encodings.push_back(encoding);
// Release the detector obtained from ucsdet_open.
200 ucsdet_close(detector);
// Adds "windows-1252" to the result.
// NOTE(review): looks like the fallback for builds without detection support;
// the surrounding preprocessor condition is not visible - confirm.
203 encodings.push_back(
"windows-1252");
// Reads the "Encoding" key of the "EasyRPG" section from the ini file named by
// ini_file and maps its numeric value to an encoding name through
// CodepageToEncoding.
209std::string ReaderUtil::GetEncoding(std::string_view ini_file) {
211 INIReader ini(ToString(ini_file));
// NOTE(review): -1 is presumably INIReader's "could not open" result - confirm.
212 if (ini.ParseError() != -1) {
// Empty string is the default when the key is absent.
213 auto encoding = ini.Get(
"EasyRPG",
"Encoding",
"");
214 if (!encoding.empty()) {
// atoi yields 0 for non-numeric text; that value is passed on unchanged.
215 return ReaderUtil::CodepageToEncoding(atoi(std::string(encoding).c_str()));
// NOTE(review): per its text this warning belongs to a build without ini
// support; the preprocessor condition is not visible here - confirm.
219 Log::Warning(
"Could not get encoding from ini file, disabled in this liblcf build.");
// Stream variant: reads the "Encoding" key of the "EasyRPG" section from ini
// data supplied through filestream and maps its numeric value to an encoding
// name through CodepageToEncoding.
224std::string ReaderUtil::GetEncoding(std::istream& filestream) {
226 INIReader ini(filestream);
// NOTE(review): -1 is presumably INIReader's "could not read" result - confirm.
227 if (ini.ParseError() != -1) {
// Empty string is the default when the key is absent.
228 auto encoding = ini.Get(
"EasyRPG",
"Encoding",
"");
229 if (!encoding.empty()) {
// atoi yields 0 for non-numeric text; that value is passed on unchanged.
230 return ReaderUtil::CodepageToEncoding(atoi(std::string(encoding).c_str()));
// NOTE(review): per its text this warning belongs to a build without ini
// support; the preprocessor condition is not visible here - confirm.
234 Log::Warning(
"Could not get encoding from ini file, disabled in this liblcf build.");
// Derives a codepage from the environment and returns the matching encoding
// name via CodepageToEncoding. Two sources are visible: the value of GetACP()
// and a language-based lookup on the name of std::locale("").
// NOTE(review): these are presumably alternative preprocessor branches
// (GetACP on Windows, locale name elsewhere) - the conditions are not visible.
239std::string ReaderUtil::GetLocaleEncoding() {
241 int codepage = GetACP();
// Locale selected by the environment.
// NOTE(review): std::locale("") may throw std::runtime_error when the
// environment names an unknown locale; no handler is visible here.
249 std::locale loc = std::locale(
"");
// Locale name up to the first '@' or '.' (e.g. the part before a charset or
// modifier suffix); the whole name when neither character occurs.
251 std::string loc_full = loc.name().substr(0, loc.name().find_first_of(
"@."));
// Locale name up to the first '_'; the whole name when there is none.
253 std::string loc_lang = loc.name().substr(0, loc.name().find_first_of(
"_"));
// Language / region to codepage table. Chinese is decided on the full
// language_REGION value, everything else on the language part only.
// Parts of the 1250, 1252 and 1257 groups are not visible in this listing.
255 if (loc_lang ==
"th") codepage = 874;
256 else if (loc_lang ==
"ja") codepage = 932;
257 else if (loc_full ==
"zh_CN" ||
258 loc_full ==
"zh_SG") codepage = 936;
259 else if (loc_lang ==
"ko") codepage = 949;
260 else if (loc_full ==
"zh_TW" ||
261 loc_full ==
"zh_HK") codepage = 950;
262 else if (loc_lang ==
"cs" ||
268 loc_lang ==
"sl") codepage = 1250;
269 else if (loc_lang ==
"ru") codepage = 1251;
270 else if (loc_lang ==
"ca" ||
282 loc_lang ==
"eu") codepage = 1252;
283 else if (loc_lang ==
"el") codepage = 1253;
284 else if (loc_lang ==
"tr") codepage = 1254;
285 else if (loc_lang ==
"he") codepage = 1255;
286 else if (loc_lang ==
"ar") codepage = 1256;
287 else if (loc_lang ==
"et" ||
289 loc_lang ==
"lv") codepage = 1257;
290 else if (loc_lang ==
"vi") codepage = 1258;
// Convert the selected codepage into an encoding name.
293 return CodepageToEncoding(codepage);
// Takes a string and the name of its source encoding and returns a std::string.
// Only the initial copy of the input is visible in this listing; the
// conversion steps that follow are not shown.
296std::string ReaderUtil::Recode(std::string_view str_to_encode, std::string_view source_encoding) {
// Working copy of the input.
298 std::string out = ToString(str_to_encode);
// Returns a normalized form of a UTF-8 string.
//
// ICU path (visible steps): decode UTF-8 to UTF-16 leniently, lowercase with
// the root locale (""), apply NFKC normalization, encode back to UTF-8.
// - If a conversion step fails, the error is logged and the input is returned
//   unchanged.
// - If the NFKC normalizer instance is unavailable, an error is logged and the
//   lowercased (not normalized) string is returned.
// - If normalization itself fails, the lowercased string is converted instead.
// Fallback path (last two lines): byte-wise tolower of a copy of the input.
// NOTE(review): the fallback is presumably the branch for builds without ICU;
// the preprocessor conditions are not visible in this listing - confirm.
303std::string ReaderUtil::Normalize(std::string_view str) {
309 UErrorCode err = U_ZERO_ERROR;
// Logs the failing ICU function together with the current error and returns
// the unmodified input. `err` is captured by reference: a by-value capture
// would freeze it at U_ZERO_ERROR (its value when the lambda is created) and
// every message would then report "U_ZERO_ERROR" instead of the real failure.
311 auto log_warning = [&err, &str](
const char* func_name) {
312 Log::Error(
"%s failed while normalizing \"%s\": %s", func_name, std::string(str).c_str(), u_errorName(err));
313 return std::string(str);
// One UTF-16 unit per input byte is an upper bound; +1 leaves room for a
// terminator.
316 std::vector<UChar> uni(str.length() + 1);
318 u_strFromUTF8Lenient(uni.data(), uni.size(), &uni_length, str.data(), str.length(), &err);
319 if (U_FAILURE(err)) {
320 return log_warning(
"u_strFromUTF8Lenient");
// In-place lowercase using the root locale ("").
323 uni_length = u_strToLower(uni.data(), uni.size(), uni.data(), uni_length,
"", &err);
324 if (U_FAILURE(err)) {
325 return log_warning(
"u_strToLower");
// UTF-8 output buffer sized at four bytes per UTF-16 buffer slot plus one.
328 std::vector<char> res;
329 int res_capac = uni.size() * 4 + 1;
330 res.resize(res_capac);
332 const UNormalizer2* norm = unorm2_getNFKCInstance(&err);
333 if (U_FAILURE(err)) {
// Function-local flag next to the one-off error message below; the code that
// tests and sets it is not visible in this listing.
334 static bool err_reported =
false;
336 Log::Error(
"Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!", u_errorName(err));
// Without a normalizer, return the lowercased string as UTF-8.
342 u_strToUTF8(res.data(), res_capac, &uni_length, uni.data(), uni_length, &err);
343 if (U_FAILURE(err)) {
344 return log_warning(
"u_strToUTF8 (1)");
347 return std::string(res.data(), uni_length);
// Destination buffer for the normalized text: twice the lowercased length
// plus one.
350 std::vector<UChar> uni_norm(uni_length * 2 + 1);
351 auto uni_norm_length = unorm2_normalize(norm, uni.data(), uni_length, uni_norm.data(), uni_norm.size(), &err);
353 if (U_FAILURE(err)) {
// Normalization failed: log it (return value intentionally unused here) and
// convert the lowercased, non-normalized text instead.
354 log_warning(
"unorm2_normalize");
359 u_strToUTF8(res.data(), res_capac, &uni_length, uni.data(), uni_length, &err);
360 if (U_FAILURE(err)) {
361 return log_warning(
"u_strToUTF8 (2)");
// Normal case: convert the normalized text to UTF-8.
365 u_strToUTF8(res.data(), res_capac, &uni_length, uni_norm.data(), uni_norm_length, &err);
366 if (U_FAILURE(err)) {
367 return log_warning(
"u_strToUTF8 (3)");
371 return std::string(res.data(), uni_length);
// Byte-wise lowercase of a copy of the input.
373 auto result = std::string(str);
// tolower is only defined for values representable as unsigned char (or EOF);
// taking the byte as unsigned char keeps non-ASCII UTF-8 bytes, which are
// negative where char is signed, out of undefined behavior.
374 std::transform(result.begin(), result.end(), result.begin(), [](unsigned char c) { return static_cast<char>(tolower(c)); });
// Variadic logging entry point taking a format string followed by its
// arguments.
// NOTE(review): LIKE_PRINTF presumably expands to a printf-format-checking
// attribute - confirm against its definition.
void Warning(const char *fmt,...) LIKE_PRINTF
// Variadic logging entry point for error messages; same calling shape as
// Warning above.
void Error(const char *fmt,...) LIKE_PRINTF