# Recovered from a collapsed patch view of commit 7ac684f1f820 (HsiangNianian,
# 2025-12-30), which created this file as src/utils/word_conll_to_char_conll.py.

# First column of a document-boundary row in CoNLL files.
_DOCSTART_TOKEN = "-DOCSTART-"
# Marker row emitted in front of a new sample (kept exactly as originally written).
_DOCSTART_LINE = "-DOCSTART- -X- O"


def _char_labels(label: str, length: int) -> list[str]:
    """Return the labels for each of the *length* characters of one token.

    ``B-tag`` yields ``B-tag`` for the first character and ``I-tag`` for the
    rest; any other ``X-tag`` label yields ``I-tag`` throughout. Labels with
    no ``prefix-tag`` shape (``O`` included) are copied unchanged to every
    character instead of being sliced blindly.
    """
    prefix, sep, tag = label.partition("-")
    if not sep or not tag:
        return [label] * length
    inside = f"I-{tag}"
    first = f"B-{tag}" if prefix == "B" else inside
    # The trailing slice keeps the result empty for a zero-length token.
    return ([first] + [inside] * (length - 1))[:length]


def word_conll_to_char_conll(word_conll_lines: list[str]) -> list[str]:
    """Convert word-level CoNLL rows into character-level CoNLL rows.

    Each input row is split on whitespace; column 0 is the token and
    column 3 is its label. Every character of the token becomes one output
    row of the form ``"<char> -X- _ <label>"``.

    Args:
        word_conll_lines: Raw lines of a word-level CoNLL file, with or
            without trailing newlines.

    Returns:
        The character-level lines, without trailing newlines. Blank lines are
        preserved as ``""``. Rows with fewer than four columns and
        ``-DOCSTART-`` rows are passed through (right-stripped). A
        ``-DOCSTART- -X- O`` row is inserted before the first ``B-speaker``
        token that follows the start of input or a blank line.
    """
    char_lines: list[str] = []
    # True while the next "B-speaker" token should open a new sample.
    in_new_sample = True

    for line in word_conll_lines:
        stripped = line.strip()
        if not stripped:
            # A blank line separates samples: re-arm the new-sample flag.
            in_new_sample = True
            char_lines.append("")
            continue

        parts = stripped.split()
        # Document markers are metadata, not tokens; without this check a
        # four-column "-DOCSTART- -X- -X- O" row would be split into
        # characters like ordinary text.
        if len(parts) < 4 or parts[0] == _DOCSTART_TOKEN:
            char_lines.append(line.rstrip())
            continue

        token, label = parts[0], parts[3]

        # A speaker tag at the start of a sample marks a new utterance.
        if label == "B-speaker" and in_new_sample:
            char_lines.append(_DOCSTART_LINE)
            in_new_sample = False

        char_lines.extend(
            f"{char} -X- _ {char_label}"
            for char, char_label in zip(token, _char_labels(label, len(token)))
        )

    return char_lines


def _main() -> None:
    """Command-line entry point: convert the input file into the output file."""
    import sys

    if len(sys.argv) < 3:
        print(
            "Usage: python word_conll_to_char_conll.py "
            "<input_word_conll> <output_char_conll>"
        )
        sys.exit(1)

    input_fp = sys.argv[1]
    output_fp = sys.argv[2]

    with open(input_fp, "r", encoding="utf-8") as f:
        word_conll_lines = f.readlines()

    char_conll_lines = word_conll_to_char_conll(word_conll_lines)

    with open(output_fp, "w", encoding="utf-8") as f:
        f.write("\n".join(char_conll_lines) + "\n")

    print(f"Converted {input_fp} to character-level CoNLL format at {output_fp}")


if __name__ == "__main__":
    _main()