From 575114661ef9afb95df2a211e1d8498686340e6b Mon Sep 17 00:00:00 2001 From: HsiangNianian Date: Tue, 30 Dec 2025 19:54:08 +0800 Subject: feat: Refactor and enhance TRPG NER model SDK - Removed deprecated `word_conll_to_char_conll.py` utility and integrated its functionality into the new `utils` module. - Introduced a comprehensive GitHub Actions workflow for automated publishing to PyPI and GitHub Releases. - Added `__init__.py` files to establish package structure for `basemodel`, `inference`, `training`, and `utils` modules. - Implemented model downloading functionality in `download_model.py` to fetch pre-trained ONNX models. - Developed `TRPGParser` class for ONNX-based inference, including methods for parsing TRPG logs. - Created training utilities in `training/__init__.py` for NER model training with Hugging Face Transformers. - Enhanced utility functions for CoNLL file parsing and dataset creation. - Added command-line interface for converting CoNLL files to datasets with validation options. 
def word_conll_to_char_conll(word_conll_lines: list[str]) -> list[str]:
    """Convert word-level CoNLL lines into character-level CoNLL lines.

    Each input line is split on whitespace; the first field is taken as the
    token and the fourth field as its BIO label. Every character of the
    token becomes its own output row of the form ``<char> -X- _ <label>``:

    * ``O`` tokens yield ``O`` for every character.
    * ``B-TAG`` tokens yield ``B-TAG`` for the first character and ``I-TAG``
      for the remaining ones.
    * Any other label yields ``I-<label[2:]>`` for every character.

    A ``-DOCSTART- -X- O`` row is inserted before the first ``B-speaker``
    token seen after the start of the input or after a blank line.

    Args:
        word_conll_lines: Lines of a word-level CoNLL file, with or without
            trailing newlines.

    Returns:
        The character-level lines, without trailing newlines. Blank input
        lines are kept as empty strings. Lines with fewer than four fields
        and lines whose first field is ``-DOCSTART-`` are copied through
        with only trailing whitespace removed.
    """
    char_lines: list[str] = []
    # Whether the next B-speaker token should open a new sample.
    in_new_sample = True

    for line in word_conll_lines:
        stripped = line.strip()
        if not stripped:
            # Blank line: the next utterance starts a new sample.
            in_new_sample = True
            char_lines.append("")
            continue

        parts = stripped.split()
        # Pass through rows that are not token annotations. An existing
        # 4-column document marker must not be exploded into characters.
        if len(parts) < 4 or parts[0] == "-DOCSTART-":
            char_lines.append(line.rstrip())
            continue

        token, label = parts[0], parts[3]

        # A B-speaker label at the start of a sample marks a new utterance.
        if label == "B-speaker" and in_new_sample:
            char_lines.append("-DOCSTART- -X- O")
            in_new_sample = False

        if label == "O":
            char_lines.extend(f"{c} -X- _ O" for c in token)
            continue

        # Only the first character of a B- token keeps the B- prefix.
        starts_entity = label.startswith("B-")
        tag = label[2:]
        for i, c in enumerate(token):
            prefix = "B" if starts_entity and i == 0 else "I"
            char_lines.append(f"{c} -X- _ {prefix}-{tag}")

    return char_lines


def _main() -> None:
    """Command-line entry point: convert one CoNLL file into another."""
    import sys

    if len(sys.argv) < 3:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(
            "Usage: python word_conll_to_char_conll.py "
            "<input_conll> <output_conll>"
        )

    input_fp = sys.argv[1]
    output_fp = sys.argv[2]

    with open(input_fp, "r", encoding="utf-8") as f:
        word_conll_lines = f.readlines()

    char_conll_lines = word_conll_to_char_conll(word_conll_lines)

    with open(output_fp, "w", encoding="utf-8") as f:
        f.write("\n".join(char_conll_lines) + "\n")

    print(f"Converted {input_fp} to character-level CoNLL format at {output_fp}")


if __name__ == "__main__":
    _main()