From 575114661ef9afb95df2a211e1d8498686340e6b Mon Sep 17 00:00:00 2001
From: HsiangNianian <i@jyunko.cn>
Date: Tue, 30 Dec 2025 19:54:08 +0800
Subject: feat: Refactor and enhance TRPG NER model SDK

- Removed deprecated `word_conll_to_char_conll.py` utility and integrated its functionality into the new `utils` module.
- Introduced a comprehensive GitHub Actions workflow for automated publishing to PyPI and GitHub Releases.
- Added `__init__.py` files to establish package structure for `basemodel`, `inference`, `training`, and `utils` modules.
- Implemented model downloading functionality in `download_model.py` to fetch pre-trained ONNX models.
- Developed `TRPGParser` class for ONNX-based inference, including methods for parsing TRPG logs.
- Created training utilities in `training/__init__.py` for NER model training with Hugging Face Transformers.
- Enhanced utility functions for CoNLL file parsing and dataset creation.
- Added command-line interface for converting CoNLL files to datasets with validation options.
---
 src/utils/word_conll_to_char_conll.py | 55 -----------------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 src/utils/word_conll_to_char_conll.py

(limited to 'src/utils/word_conll_to_char_conll.py')

diff --git a/src/utils/word_conll_to_char_conll.py b/src/utils/word_conll_to_char_conll.py
deleted file mode 100644
index e52405f..0000000
--- a/src/utils/word_conll_to_char_conll.py
+++ /dev/null
@@ -1,55 +0,0 @@
-def word_conll_to_char_conll(word_conll_lines: list[str]) -> list[str]:
-    char_lines = []
-    in_new_sample = True  # 下一行是否应视为新样本开始
-
-    for line in word_conll_lines:
-        stripped = line.strip()
-        if not stripped:
-            # 空行 → 标记下一句为新样本
-            in_new_sample = True
-            char_lines.append("")
-            continue
-
-        parts = stripped.split()
-        if len(parts) < 4:
-            char_lines.append(line.rstrip())
-            continue
-
-        token, label = parts[0], parts[3]
-
-        # 检测新发言：B-speaker 出现 → 新样本
-        if label == "B-speaker" and in_new_sample:
-            char_lines.append("-DOCSTART- -X- O")
-            in_new_sample = False
-
-        # 转换 token → char labels（同前）
-        if label == "O":
-            for c in token:
-                char_lines.append(f"{c} -X- _ O")
-        else:
-            bio_prefix = label[:2]
-            tag = label[2:]
-            for i, c in enumerate(token):
-                char_label = f"B-{tag}" if (bio_prefix == "B-" and i == 0) else f"I-{tag}"
-                char_lines.append(f"{c} -X- _ {char_label}")
-
-    return char_lines
-
-if __name__ == "__main__":
-    import sys
-    if len(sys.argv) < 3:
-        print("Usage: python word_conll_to_char_conll.py <input_word.conll> <output_char.conll>")
-        sys.exit(1)
-
-    input_fp = sys.argv[1]
-    output_fp = sys.argv[2]
-
-    with open(input_fp, "r", encoding="utf-8") as f:
-        word_conll_lines = f.readlines()
-
-    char_conll_lines = word_conll_to_char_conll(word_conll_lines)
-
-    with open(output_fp, "w", encoding="utf-8") as f:
-        f.write("\n".join(char_conll_lines) + "\n")
-
-    print(f"Converted {input_fp} to character-level CoNLL format at {output_fp}")
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2