| author | 2025-10-29 18:47:51 +0800 |
| committer | 2025-10-29 18:47:51 +0800 |
| commit | dcaf3a04f343f9f01191056fff9eb43f04d2b45f (patch) |
| tree | ee8725a900c8800747dafeac8b25453be66fb463 /src/conventionalrp |
| parent | 2cdb39569c1ba967e04ee604b7a9610055c0af77 (diff) |
| download | conventional_role_play-dcaf3a04f343f9f01191056fff9eb43f04d2b45f.tar.gz, conventional_role_play-dcaf3a04f343f9f01191056fff9eb43f04d2b45f.zip |
refactor: streamline THULACParser documentation and remove redundant comments
Diffstat (limited to 'src/conventionalrp')
| -rw-r--r-- | src/conventionalrp/core/thulac_parser.py | 29 |
1 file changed, 4 insertions(+), 25 deletions(-)
````diff
diff --git a/src/conventionalrp/core/thulac_parser.py b/src/conventionalrp/core/thulac_parser.py
index 075c3bd..d4f7706 100644
--- a/src/conventionalrp/core/thulac_parser.py
+++ b/src/conventionalrp/core/thulac_parser.py
@@ -30,19 +30,10 @@ try:
 except ImportError:
     THULAC_AVAILABLE = False
     logger.warning("THULAC not installed. Please install with: pip install thulac")
+
 class THULACParser:
-    """
-    Intelligent TRPG log parser based on THULAC
-
-    Features:
-    1. Uses THULAC for Chinese word segmentation and part-of-speech tagging
-    2. Automatically identifies content types based on POS tags and context
-    3. Minimal rule configuration (only delimiters need to be set)
-    4. Supports custom dictionary extensions
-    """
-    # Default delimiter configuration (can be overridden via load_rules)
     DEFAULT_DELIMITERS = {
         "dialogue": [
@@ -66,7 +57,6 @@ class THULACParser:
         ]
     }
 
-    # Mapping from THULAC POS tags to content types
     POS_TYPE_MAPPING = {
         # verb-related -> action
         "v": "action",  # verb
@@ -158,6 +148,7 @@ class THULACParser:
             rules_dict: pass a rules dictionary directly
 
         Example rule format:
+        ```json
         {
             "delimiters": {
                 "dialogue": [["\"", "\""], ["“", "”"]],
@@ -170,6 +161,7 @@ class THULACParser:
                 "守秘人": "np"
             }
         }
+        ```
         """
         import json5
         from pathlib import Path
@@ -227,7 +219,6 @@ class THULACParser:
                     "confidence": 1.0  # delimiter matches get 100% confidence
                 })
 
-        # Sort by position
         results.sort(key=lambda x: x["start"])
         return results
@@ -238,7 +229,6 @@ class THULACParser:
         Returns:
             List of {type, content, words, tags, confidence}
         """
-        # Use THULAC for word segmentation and POS tagging
         result = self.thulac.cut(text, text=False)  # returns [(word, pos), ...]
 
         if not result:
@@ -262,8 +252,6 @@ class THULACParser:
         # Infer content type from POS tags and content
         content_type = self._infer_content_type(words, tags)
-
-        # Compute confidence
         confidence = self._calculate_confidence(words, tags, content_type)
 
         return [{
@@ -284,17 +272,14 @@ class THULACParser:
         2. Check for dialogue indicator words -> dialogue
         3. Count the dominant POS tag -> decide via the mapping table
         """
-        # Strategy 1: check for action verbs
         for word in words:
             if word in self.ACTION_VERBS:
                 return "action"
 
-        # Strategy 2: check for dialogue indicators
         dialogue_indicators = sum(1 for w in words if w in self.DIALOGUE_INDICATORS)
         if dialogue_indicators >= 2:  # at least 2 dialogue indicators
             return "dialogue"
 
-        # Strategy 3: count POS tags
         pos_count = {}
         for tag in tags:
             if tag == "w":  # ignore punctuation
@@ -329,7 +314,6 @@ class THULACParser:
         base_confidence = 0.5
 
-        # Factor 1: keyword matches
         if content_type == "action":
             action_word_count = sum(1 for w in words if w in self.ACTION_VERBS)
             if action_word_count > 0:
@@ -338,8 +322,7 @@ class THULACParser:
             dialogue_word_count = sum(1 for w in words if w in self.DIALOGUE_INDICATORS)
             if dialogue_word_count >= 2:
                 base_confidence += 0.3
-
-        # Factor 2: POS consistency
+
         unique_pos = len(set(tag for tag in tags if tag != "w"))
         if unique_pos == 1:
             base_confidence += 0.2
@@ -361,13 +344,11 @@ class THULACParser:
         results = []
         covered_ranges = set()
 
-        # First add all delimiter-marked content
         for item in delimited:
             results.append(item)
             for i in range(item["start"], item["end"]):
                 covered_ranges.add(i)
 
-        # Then analyze text segments not covered by delimiters
         uncovered_segments = []
         start = 0
         for i in range(len(text)):
@@ -378,7 +359,6 @@ class THULACParser:
         if start < len(text):
             uncovered_segments.append((start, len(text)))
 
-        # Analyze uncovered segments with THULAC
         for start, end in uncovered_segments:
             segment = text[start:end].strip()
             if segment:
@@ -388,7 +368,6 @@ class THULACParser:
                     item["end"] = end
                     results.append(item)
 
-        # Re-sort by position
         results.sort(key=lambda x: x.get("start", 0))
         return results
````
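For context, the JSON5 rule format that this commit wraps in a fenced block inside the `load_rules` docstring can also be supplied directly as a dictionary. The sketch below is a usage illustration only, not code from the repository: the import path assumes the project's `src/` layout, the no-argument constructor is an assumption, and the top-level parsing call is hypothetical because the public parse method is not visible in this diff.

```python
from conventionalrp.core.thulac_parser import THULACParser

# Rule format documented in the load_rules docstring: only delimiters
# (and an optional custom dictionary) need to be configured.
rules = {
    "delimiters": {
        "dialogue": [["\"", "\""], ["“", "”"]],
    },
    "custom_dict": {
        "守秘人": "np",  # tag a campaign-specific title as a proper noun (np)
    },
}

parser = THULACParser()              # assumed: no-argument constructor
parser.load_rules(rules_dict=rules)  # rules_dict keyword taken from the docstring

# Hypothetical entry point; the public parsing method is not shown in this diff.
# segments = parser.parse('守秘人:"你们听到走廊尽头传来脚步声。"')
# for seg in segments:
#     print(seg["type"], round(seg["confidence"], 2), seg["content"])
```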
