icu示例
This commit is contained in:
parent
3efec5c85f
commit
4884ba8e40
65
example/cpp/icu.cpp
Normal file
65
example/cpp/icu.cpp
Normal file
@ -0,0 +1,65 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
#include <unicode/uchar.h>
|
||||
#include <unicode/unistr.h>
|
||||
#include <unicode/ustring.h>
|
||||
#include <vector>
|
||||
|
||||
// 判断是否为中文字符(使用 ICU 内置属性)
|
||||
bool is_chinese_char(UChar32 c)
|
||||
{
|
||||
return u_hasBinaryProperty(c, UCHAR_IDEOGRAPHIC);
|
||||
}
|
||||
|
||||
// 提取并去重中文字符
|
||||
std::set<UChar32> extract_chinese_chars(const icu::UnicodeString& ustr)
|
||||
{
|
||||
std::set<UChar32> result;
|
||||
for (int32_t i = 0; i < ustr.length();) {
|
||||
UChar32 c = ustr.char32At(i); // 自动处理代理对(Surrogate Pairs)
|
||||
i += U16_LENGTH(c); // 前进 1 或 2 个 UTF-16 单元
|
||||
if (is_chinese_char(c)) {
|
||||
result.insert(c);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
if (argc != 3) {
|
||||
std::cerr << "Usage: " << argv[0] << " <input.txt> <output.txt>\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
// 1. 读取文件内容
|
||||
std::ifstream input(argv[1]);
|
||||
if (!input) {
|
||||
std::cerr << "Error opening input file\n";
|
||||
return 1;
|
||||
}
|
||||
std::string contents((std::istreambuf_iterator<char>(input)), std::istreambuf_iterator<char>());
|
||||
|
||||
// 2. 转换为 ICU 的 UnicodeString(自动检测 UTF-8)
|
||||
icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(contents);
|
||||
|
||||
// 3. 提取中文字符
|
||||
auto chinese_chars = extract_chinese_chars(ustr);
|
||||
|
||||
// 4. 写入输出文件(UTF-8 编码)
|
||||
std::ofstream output(argv[2]);
|
||||
if (!output) {
|
||||
std::cerr << "Error opening output file\n";
|
||||
return 1;
|
||||
}
|
||||
for (UChar32 c : chinese_chars) {
|
||||
icu::UnicodeString temp(c);
|
||||
std::string utf8;
|
||||
temp.toUTF8String(utf8);
|
||||
output << utf8;
|
||||
}
|
||||
|
||||
std::cout << "Extracted " << chinese_chars.size() << " unique Chinese characters to " << argv[2] << std::endl;
|
||||
return 0;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user