icu示例

This commit is contained in:
taynpg 2025-08-03 11:06:19 +08:00
parent 3efec5c85f
commit 4884ba8e40

65
example/cpp/icu.cpp Normal file
View File

@ -0,0 +1,65 @@
#include <fstream>
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <vector>

#include <unicode/uchar.h>
#include <unicode/unistr.h>
#include <unicode/ustring.h>
// 判断是否为中文字符(使用 ICU 内置属性)
bool is_chinese_char(UChar32 c)
{
return u_hasBinaryProperty(c, UCHAR_IDEOGRAPHIC);
}
// 提取并去重中文字符
std::set<UChar32> extract_chinese_chars(const icu::UnicodeString& ustr)
{
std::set<UChar32> result;
for (int32_t i = 0; i < ustr.length();) {
UChar32 c = ustr.char32At(i); // 自动处理代理对(Surrogate Pairs)
i += U16_LENGTH(c); // 前进 1 或 2 个 UTF-16 单元
if (is_chinese_char(c)) {
result.insert(c);
}
}
return result;
}
int main(int argc, char** argv)
{
if (argc != 3) {
std::cerr << "Usage: " << argv[0] << " <input.txt> <output.txt>\n";
return 1;
}
// 1. 读取文件内容
std::ifstream input(argv[1]);
if (!input) {
std::cerr << "Error opening input file\n";
return 1;
}
std::string contents((std::istreambuf_iterator<char>(input)), std::istreambuf_iterator<char>());
// 2. 转换为 ICU 的 UnicodeString(自动检测 UTF-8)
icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(contents);
// 3. 提取中文字符
auto chinese_chars = extract_chinese_chars(ustr);
// 4. 写入输出文件(UTF-8 编码)
std::ofstream output(argv[2]);
if (!output) {
std::cerr << "Error opening output file\n";
return 1;
}
for (UChar32 c : chinese_chars) {
icu::UnicodeString temp(c);
std::string utf8;
temp.toUTF8String(utf8);
output << utf8;
}
std::cout << "Extracted " << chinese_chars.size() << " unique Chinese characters to " << argv[2] << std::endl;
return 0;
}