From 4884ba8e40de0d213424475282dc9f2811f266d6 Mon Sep 17 00:00:00 2001
From: taynpg
Date: Sun, 3 Aug 2025 11:06:19 +0800
Subject: [PATCH] =?UTF-8?q?icu=E7=A4=BA=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 example/cpp/icu.cpp | 83 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 example/cpp/icu.cpp

diff --git a/example/cpp/icu.cpp b/example/cpp/icu.cpp
new file mode 100644
index 0000000..69af714
--- /dev/null
+++ b/example/cpp/icu.cpp
@@ -0,0 +1,83 @@
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <set>
+#include <string>
+
+#include <unicode/uchar.h>
+#include <unicode/unistr.h>
+#include <unicode/utf16.h>
+
+// Returns true when `c` has the Unicode "Ideographic" binary property, which
+// this example uses as its definition of a Chinese character.
+bool is_chinese_char(UChar32 c)
+{
+    return u_hasBinaryProperty(c, UCHAR_IDEOGRAPHIC);
+}
+
+// Collects every distinct code point in `ustr` accepted by is_chinese_char().
+// std::set removes duplicates and keeps the result ordered by code point.
+std::set<UChar32> extract_chinese_chars(const icu::UnicodeString& ustr)
+{
+    std::set<UChar32> result;
+    const int32_t length = ustr.length();
+    for (int32_t i = 0; i < length;) {
+        // char32At() combines a surrogate pair into a single code point.
+        const UChar32 c = ustr.char32At(i);
+        // Advance by 1 or 2 UTF-16 code units, depending on the code point.
+        i += U16_LENGTH(c);
+        if (is_chinese_char(c)) {
+            result.insert(c);
+        }
+    }
+    return result;
+}
+
+// Usage: <program> <input_file> <output_file>
+// Reads a UTF-8 file and writes each distinct Chinese character it contains,
+// once, to the output file. Returns 0 on success and 1 on any error.
+int main(int argc, char** argv)
+{
+    if (argc != 3) {
+        std::cerr << "Usage: " << argv[0] << " <input_file> <output_file>\n";
+        return 1;
+    }
+
+    // 1. Read the whole input file. Binary mode keeps the UTF-8 bytes intact
+    //    (no newline translation on platforms that have a text mode).
+    std::ifstream input(argv[1], std::ios::binary);
+    if (!input) {
+        std::cerr << "Error opening input file\n";
+        return 1;
+    }
+    const std::string contents((std::istreambuf_iterator<char>(input)), std::istreambuf_iterator<char>());
+
+    // 2. Convert to an ICU UnicodeString. fromUTF8() does not detect the
+    //    encoding; the input is assumed to be UTF-8.
+    const icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(contents);
+
+    // 3. Extract the unique Chinese characters.
+    const auto chinese_chars = extract_chinese_chars(ustr);
+
+    // 4. Write them to the output file as UTF-8.
+    std::ofstream output(argv[2], std::ios::binary);
+    if (!output) {
+        std::cerr << "Error opening output file\n";
+        return 1;
+    }
+    icu::UnicodeString collected;
+    for (const UChar32 c : chinese_chars) {
+        collected.append(c);
+    }
+    std::string utf8;
+    collected.toUTF8String(utf8);
+    output << utf8;
+    output.flush();
+    if (!output) {
+        std::cerr << "Error writing output file\n";
+        return 1;
+    }
+
+    std::cout << "Extracted " << chinese_chars.size() << " unique Chinese characters to " << argv[2] << '\n';
+    return 0;
+}