path: root/src/utils/word_conll_to_char_conll.py
author     HsiangNianian <i@jyunko.cn>  2025-12-30 19:14:39 +0800
committer  HsiangNianian <i@jyunko.cn>  2025-12-30 19:14:39 +0800
commit     7ac684f1f82023c6284cd7d7efde11b8dc98c149 (patch)
tree       4ac4e9fb72a4e1e2578d9fb4e9704967b052ec15 /src/utils/word_conll_to_char_conll.py
parent     12910f3a937633a25aa0de463a6edf756f2b8cdd (diff)
feat: Implement TRPG NER training and inference script with robust model path detection and enhanced timestamp/speaker handling
- Added main training and inference logic in main.py, including CoNLL parsing, tokenization, and model training.
- Introduced TRPGParser class for inference with entity aggregation and special handling for timestamps and speakers.
- Developed utility functions for converting word-level CoNLL to char-level and saving datasets in various formats.
- Added ONNX export functionality for the trained model.
- Created a comprehensive requirements.txt and updated pyproject.toml with necessary dependencies.
- Implemented tests for ONNX inference to validate model outputs.
Diffstat (limited to 'src/utils/word_conll_to_char_conll.py')
-rw-r--r--  src/utils/word_conll_to_char_conll.py  55
1 file changed, 55 insertions, 0 deletions
diff --git a/src/utils/word_conll_to_char_conll.py b/src/utils/word_conll_to_char_conll.py
new file mode 100644
index 0000000..e52405f
--- /dev/null
+++ b/src/utils/word_conll_to_char_conll.py
@@ -0,0 +1,55 @@
+def word_conll_to_char_conll(word_conll_lines: list[str]) -> list[str]:
+    char_lines = []
+    in_new_sample = True  # whether the next line should be treated as the start of a new sample
+
+    for line in word_conll_lines:
+        stripped = line.strip()
+        if not stripped:
+            # blank line → mark the next sentence as a new sample
+            in_new_sample = True
+            char_lines.append("")
+            continue
+
+        parts = stripped.split()
+        if len(parts) < 4:
+            char_lines.append(line.rstrip())
+            continue
+
+        token, label = parts[0], parts[3]
+
+        # detect a new utterance: a B-speaker label starts a new sample
+        if label == "B-speaker" and in_new_sample:
+            char_lines.append("-DOCSTART- -X- O")
+            in_new_sample = False
+
+        # convert the token to per-character labels (same scheme as before)
+        if label == "O":
+            for c in token:
+                char_lines.append(f"{c} -X- _ O")
+        else:
+            bio_prefix = label[:2]
+            tag = label[2:]
+            for i, c in enumerate(token):
+                char_label = f"B-{tag}" if (bio_prefix == "B-" and i == 0) else f"I-{tag}"
+                char_lines.append(f"{c} -X- _ {char_label}")
+
+    return char_lines
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 3:
+        print("Usage: python word_conll_to_char_conll.py <input_word.conll> <output_char.conll>")
+        sys.exit(1)
+
+    input_fp = sys.argv[1]
+    output_fp = sys.argv[2]
+
+    with open(input_fp, "r", encoding="utf-8") as f:
+        word_conll_lines = f.readlines()
+
+    char_conll_lines = word_conll_to_char_conll(word_conll_lines)
+
+    with open(output_fp, "w", encoding="utf-8") as f:
+        f.write("\n".join(char_conll_lines) + "\n")
+
+    print(f"Converted {input_fp} to character-level CoNLL format at {output_fp}")
\ No newline at end of file
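For reference, a minimal sketch of how the new helper behaves on a tiny word-level sample follows. The two token/label lines are hypothetical example data, and the import assumes the module is on the Python path (e.g. run from src/utils); the expected output follows directly from the conversion logic in the diff above.

# Minimal usage sketch; the word-level lines below are hypothetical sample data.
from word_conll_to_char_conll import word_conll_to_char_conll

word_lines = [
    "张三 -X- _ B-speaker",  # word-level speaker token
    "攻击了 -X- _ O",        # ordinary word-level token
    "",                      # blank line ends the sample
]

for out_line in word_conll_to_char_conll(word_lines):
    print(out_line)

# Expected output:
#   -DOCSTART- -X- O
#   张 -X- _ B-speaker
#   三 -X- _ I-speaker
#   攻 -X- _ O
#   击 -X- _ O
#   了 -X- _ O
#   (blank line)

Note that only the first B-speaker of each sample triggers the -DOCSTART- marker, since in_new_sample is reset only on blank lines. Invoked from the command line, the script applies the same conversion to files, e.g. python src/utils/word_conll_to_char_conll.py train_word.conll train_char.conll (filenames hypothetical).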