From 4884ba8e40de0d213424475282dc9f2811f266d6 Mon Sep 17 00:00:00 2001
From: taynpg <taynpg@163.com>
Date: Sun, 3 Aug 2025 11:06:19 +0800
Subject: [PATCH] =?UTF-8?q?icu=E7=A4=BA=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 example/cpp/icu.cpp | 65 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 example/cpp/icu.cpp
diff --git a/example/cpp/icu.cpp b/example/cpp/icu.cpp
new file mode 100644
index 0000000..69af714
--- /dev/null
+++ b/example/cpp/icu.cpp
@@ -0,0 +1,65 @@
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <unicode/uchar.h>
+#include <unicode/unistr.h>
+#include <unicode/ustring.h>
+#include <vector>
+
+// Reports whether code point `c` has the Unicode Ideographic binary property, which this program uses as its "Chinese character" test.
+bool is_chinese_char(UChar32 c)
+{
+    return u_hasBinaryProperty(c, UCHAR_IDEOGRAPHIC) != 0;   // NOTE(review): Ideographic may be wider than Han/Chinese - confirm intent
+}
+
+// Collects every distinct code point in `ustr` that is_chinese_char() accepts; the std::set keeps a single copy of each.
+std::set<UChar32> extract_chinese_chars(const icu::UnicodeString& ustr)
+{
+    std::set<UChar32> unique_matches;
+    const int32_t unit_count = ustr.length();   // measured in UTF-16 code units
+    for (int32_t pos = 0; pos < unit_count; pos = ustr.moveIndex32(pos, 1)) {   // step = one whole code point
+        const UChar32 code_point = ustr.char32At(pos);   // a surrogate pair is read as a single code point
+        if (is_chinese_char(code_point)) {
+            unique_matches.insert(code_point);
+        }
+    }
+    return unique_matches;
+}
+
+// Entry point. Reads the file named by argv[1] as UTF-8, gathers its distinct ideographic characters via extract_chinese_chars(),
+// and writes them to argv[2] as UTF-8. Returns 0 on success, 1 on a wrong argument count or a file open/write failure.
+int main(int argc, char** argv)
+{
+    if (argc != 3) {
+        std::cerr << "Usage: " << argv[0] << " <input.txt> <output.txt>\n";
+        return 1;
+    }
+
+    // 1. Load the whole input file. Binary mode hands the raw bytes to the decoder without any text-mode translation.
+    std::ifstream input(argv[1], std::ios::binary);
+    if (!input) {
+        std::cerr << "Error opening input file\n";
+        return 1;
+    }
+    std::string contents((std::istreambuf_iterator<char>(input)), std::istreambuf_iterator<char>());
+
+    // 2. Decode the bytes as UTF-8 (fromUTF8 assumes UTF-8; it does not detect encodings) and collect the unique characters.
+    const auto chinese_chars = extract_chinese_chars(icu::UnicodeString::fromUTF8(contents));
+
+    // 3. Encode every collected character into one UTF-8 buffer, in the set's iteration order, and write it out.
+    std::ofstream output(argv[2], std::ios::binary);
+    if (!output) {
+        std::cerr << "Error opening output file\n";
+        return 1;
+    }
+    std::string utf8;
+    for (const UChar32 c : chinese_chars) {
+        icu::UnicodeString(c).toUTF8String(utf8);   // toUTF8String appends to utf8, so the buffer accumulates
+    }
+    if (!(output << utf8).flush()) {   // flush now so a failed write is noticed before success is reported
+        std::cerr << "Error writing output file\n";
+        return 1;
+    }
+    std::cout << "Extracted " << chinese_chars.size() << " unique Chinese characters to " << argv[2] << std::endl;
+    return 0;
+}
\ No newline at end of file