You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
import re
import os
from pypinyin import pinyin, Style
from tqdm import tqdm
# Read the source file, add a pinyin column to each entry, and write the result to a .txt file
def process_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file``, inserting a pinyin code column.

    Each input line is stripped of surrounding whitespace and then handled
    as follows:

    * blank lines and lines starting with ``#`` are dropped;
    * lines containing any lowercase ASCII letter are written through
      unchanged (apart from the stripping);
    * every other line is split on tabs; the first column is annotated with
      tone-less pinyin (``Style.NORMAL``), syllables joined by single spaces,
      and inserted as a new second column. At most the next two original
      columns are kept, each stripped; any further columns are discarded.

    Both files are opened as UTF-8. The output file is overwritten.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        with open(output_file, 'w', encoding='utf-8') as dst:
            # Read everything up front so the progress bar knows the total.
            raw_lines = src.readlines()
            for raw in tqdm(raw_lines, desc=f"Processing {input_file}", unit="line"):
                entry = raw.strip()

                # Skip blank lines and comment lines.
                if entry == '' or entry.startswith('#'):
                    continue

                # A lowercase ASCII letter anywhere on the line: pass it through as-is.
                if re.search(r'[a-z]', entry) is not None:
                    dst.write(entry + '\n')
                    continue

                # str.split always yields at least one element, so columns[0] is safe.
                columns = entry.split('\t')
                word = columns[0].strip()

                # pinyin() returns one list per item; take the first reading of each.
                syllables = [reading[0] for reading in pinyin(word, style=Style.NORMAL)]

                fields = [word, ' '.join(syllables)]
                # Keep at most the second and third original columns, stripped.
                fields.extend(column.strip() for column in columns[1:3])
                dst.write('\t'.join(fields) + '\n')
# Walk every .yaml file in the current directory and process each one
def process_all_yaml_files():
    """Run ``process_file`` on each ``.yaml`` entry of the working directory.

    For every directory entry whose name ends with ``.yaml``, the output
    path is the same path with the extension replaced by ``.txt``.
    """
    base_dir = os.getcwd()
    for entry in os.listdir(base_dir):
        if not entry.endswith('.yaml'):
            continue
        source_path = os.path.join(base_dir, entry)
        # Swap the .yaml extension for .txt.
        stem, _extension = os.path.splitext(source_path)
        process_file(source_path, stem + '.txt')
# Script entry point: call the function to process all files
process_all_yaml_files()
处理完之后,加回表头。表头要定义【词条列】、【编码列】,一个典型的表头如下:
# Rime dictionary
# encoding: utf-8
# 五笔小筑修正 2024-0526
# 务必存在编码列
# 表头格式要规范
# 码表主体,不要写注释
---
name: others
version: "2024-05-21"
sort: by_weight
columns:
- text
- code
- weight
- stem
...
空落落 kong luo luo
空落落 kong lao lao
空落落的 kong luo luo de
空落落的 kong lao lao de
阿房宫 e pang gong
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
-
很多拼音方案的码表非常放飞自我,因为有 librime 兜底,基本上词库都是处于没有编码的光杆司令状态。
Rimetool 是 yaml 解析工具,不是方案编译工具,需要编码作为索引的一部分。
希望拼音方案维护者们,把词库做规范一点,至少编码列要有吧?
还有……不要在【码表正文】上随便添加注释符。
我这里写了个 python 脚本,可以批处理一下。
首先,把所有【拼音码表】集中放到一个地方,手动去除【表头】,然后如下处理:
处理完之后,加回表头。表头要定义【词条列】、【编码列】,一个典型的表头如下:
Beta Was this translation helpful? Give feedback.
All reactions