From 7ac684f1f82023c6284cd7d7efde11b8dc98c149 Mon Sep 17 00:00:00 2001
From: HsiangNianian <i@jyunko.cn>
Date: Tue, 30 Dec 2025 19:14:39 +0800
Subject: feat: Implement TRPG NER training and inference script with robust
 model path detection and enhanced timestamp/speaker handling

- Added main training and inference logic in main.py, including CoNLL parsing, tokenization, and model training.
- Introduced TRPGParser class for inference with entity aggregation and special handling for timestamps and speakers.
- Developed utility functions for converting word-level CoNLL to char-level and saving datasets in various formats.
- Added ONNX export functionality for the trained model.
- Created a comprehensive requirements.txt and updated pyproject.toml with necessary dependencies.
- Implemented tests for ONNX inference to validate model outputs.
---
 src/utils/word_conll_to_char_conll.py | 55 +++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 src/utils/word_conll_to_char_conll.py

(limited to 'src/utils/word_conll_to_char_conll.py')

diff --git a/src/utils/word_conll_to_char_conll.py b/src/utils/word_conll_to_char_conll.py
new file mode 100644
index 0000000..e52405f
--- /dev/null
+++ b/src/utils/word_conll_to_char_conll.py
@@ -0,0 +1,55 @@
+def word_conll_to_char_conll(word_conll_lines: list[str]) -> list[str]:
+    """Convert word-level CoNLL lines to character-level CoNLL lines.
+
+    Rows with four or more columns yield one ``<char> -X- _ <label>`` row per
+    character of column 0, from the BIO label in column 3: only the first
+    character of a ``B-`` token keeps ``B-``; every other one gets ``I-``.
+    Blank lines become ``""``; shorter rows and ``-DOCSTART-`` rows are copied
+    through right-stripped. ``-DOCSTART- -X- O`` is inserted before the first
+    ``B-speaker`` of each sample unless the input already has a marker there.
+    """
+    char_lines = []
+    in_new_sample = True  # True until the current sample has its marker
+    for line in word_conll_lines:
+        parts = line.split()
+        if not parts:
+            # Blank line ends the sample; the next B-speaker opens a new one.
+            in_new_sample = True
+            char_lines.append("")
+            continue
+        if parts[0] == "-DOCSTART-":
+            # Keep an existing marker whole and do not generate a second one.
+            in_new_sample = False
+        if parts[0] == "-DOCSTART-" or len(parts) < 4:
+            char_lines.append(line.rstrip())
+            continue
+        token, label = parts[0], parts[3]
+        if label == "B-speaker" and in_new_sample:
+            char_lines.append("-DOCSTART- -X- O")
+            in_new_sample = False
+        if label == "O":
+            char_lines.extend(f"{c} -X- _ O" for c in token)
+            continue
+        begins, tag = label.startswith("B-"), label[2:]
+        for i, c in enumerate(token):
+            char_lines.append(f"{c} -X- _ {'B-' if begins and i == 0 else 'I-'}{tag}")
+    return char_lines
+
+if __name__ == "__main__":
+    # Command-line entry point: <input_word.conll> -> <output_char.conll>.
+    import sys
+
+    cli_args = sys.argv[1:]
+    if len(cli_args) < 2:
+        print("Usage: python word_conll_to_char_conll.py <input_word.conll> <output_char.conll>")
+        sys.exit(1)
+    input_fp, output_fp = cli_args[0], cli_args[1]
+
+    # Read all word-level rows as a list of lines (line endings included).
+    with open(input_fp, "r", encoding="utf-8") as src:
+        word_conll_lines = src.readlines()
+    char_conll_lines = word_conll_to_char_conll(word_conll_lines)
+    # Write one converted row per line, ending the file with a newline.
+    with open(output_fp, "w", encoding="utf-8") as dst:
+        dst.write("\n".join(char_conll_lines) + "\n")
+    print(f"Converted {input_fp} to character-level CoNLL format at {output_fp}")
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2