Merge pull request #6 from pineoncellar/debug

feat: 🎨 More standardized log parse & Add dependency json5
author: 简律纯 <i@jyunko.cn> 2025-03-15 21:53:01 +0800
committer: GitHub <noreply@github.com> 2025-03-15 21:53:01 +0800
commit: 5319feea52f7266029b9a3a609a3f1ae494c6a60 (patch)
tree: 33ea3be55c3c102f5975a1f2796cf6e34062d026
parent: 965771fb0d85ddb27dc6c5dd7df822d1fb318286 (diff)
parent: 5f01c1710d4b6ae1e0ce9fba10f1528711a2f63f (diff)
download: conventional_role_play-5319feea52f7266029b9a3a609a3f1ae494c6a60.tar.gz
conventional_role_play-5319feea52f7266029b9a3a609a3f1ae494c6a60.zip
5 files changed, 145 insertions, 29 deletions
diff --git a/pyproject.toml b/pyproject.toml
index 37f1421..78a737a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,6 +101,7 @@ line-ending = "auto"
 dev = [
     "docutils>=0.21.2",
     "furo>=2024.8.6",
+    "json5>=0.10.0",
     "myst-parser>=3.0.1",
     "ruff>=0.9.6",
     "sphinx>=7.4.7",
diff --git a/src/conventionalrp/core/parser.py b/src/conventionalrp/core/parser.py
index d5b91da..f451178 100644
--- a/src/conventionalrp/core/parser.py
+++ b/src/conventionalrp/core/parser.py
@@ -1,4 +1,4 @@
-import json
+import json5
 import re
 from pathlib import Path
 
@@ -15,7 +15,7 @@ class Parser:
         with open(rules_path, "r", encoding="utf-8") as f:
             file_content = f.read()
 
-        rules = json.loads(file_content)
+        rules = json5.loads(file_content)
 
         # validation rule format
         if rules is None:
@@ -34,24 +34,88 @@ class Parser:
         with open(log_path, "r", encoding="utf-8") as f:
             log_content = f.read().splitlines()
 
+        current_metadata = None
+        current_content = []
+
         # Iterate each line of the log
         for line in log_content:
             # pass blank line
             if not line.strip():
                 continue
 
-            # try to match the current line by rules
+            # Check whether this line is a metadata (entry header) line
+            is_metadata = False
             for rule in self.rules:
-                pattern = rule.get("pattern")
-                rule_type = rule.get("type")
-                match = re.search(pattern, line)
-                if match:
-                    # matched
-                    content = match.group(1).strip()
-                    parsed_data.append({"content": content, "type": rule_type})
-                    break
-            # no matched, marked as an unknown type
-            else:
-                parsed_data.append({"content": line.strip(), "type": "unknown"})
-
-        return parsed_data
+                if rule.get("type") == "metadata":
+                    patterns = rule.get("patterns", [])
+                    for pattern in patterns:
+                        match = re.search(pattern, line)
+                        if match:
+                            # A new metadata line starts a new entry, so save the previous entry first
+                            if current_metadata:
+                                parsed_data.append({
+                                    **current_metadata,
+                                    "content": current_content
+                                })
+                                current_content = []
+
+                            # Parsing new metadata
+                            current_metadata = {}
+                            groups = rule.get("groups", [])
+                            for i, key in enumerate(groups):
+                                if i + 1 <= len(match.groups()):  # Skip keys that have no matching capture group
+                                    current_metadata[key] = match.group(i + 1).strip()
+                            is_metadata = True
+                            break
+                    if is_metadata:
+                        break
+
+            if is_metadata:
+                continue  # The metadata line has been processed, skip subsequent content matching
+
+            # Split the line into typed content segments
+            remaining_line = line
+            while remaining_line:
+                matched = False
+                for rule in self.rules:
+                    # Skip metadata rules; they were handled above
+                    if rule["type"] == "metadata":
+                        continue
+
+                    for pattern in rule["patterns"]:
+                        match = re.match(pattern, remaining_line)
+                        if match:
+                            # Emit any text before the match as unknown content (note: re.match anchors at position 0, so match.start() is always 0 here)
+                            if match.start() > 0:
+                                current_content.append({
+                                    "type": "unknown",
+                                    "content": remaining_line[:match.start()]
+                                })
+                            
+                            # Extract matched content
+                            entry = {"type": rule["type"], "content": match.group(0)}
+                            for i, group in enumerate(rule["groups"]):
+                                entry[group] = match.group(i+1).strip() if match.group(i+1) else ""
+                            
+                            current_content.append(entry)
+                            remaining_line = remaining_line[match.end():].lstrip()
+                            matched = True
+                            break
+                    if matched:
+                        break
+                
+                if not matched:
+                    current_content.append({
+                        "type": "unknown",
+                        "content": remaining_line
+                    })
+                    remaining_line = ""
+
+        # Append the final entry, which no later metadata line will flush
+        if current_metadata:
+            parsed_data.append({
+                **current_metadata,
+                "content": current_content
+            })
+
+        return parsed_data
+\ No newline at end of file
diff --git a/test/example_log.log b/test/example_log.log
index aff1d3a..b7e8597 100644
--- a/test/example_log.log
+++ b/test/example_log.log
@@ -10,6 +10,12 @@ MIKU(2754533655) 2025-01-27 19:58:39
 以实玛利(1316702392) 2025-01-27 19:58:42
 “很高兴认识你，我是以实玛利”
 
+麦奎恩·马瑟斯(602380092) 2025-01-27 20:05:18
+“你好，”#向前伸出手，“鄙人马瑟斯”
+
+麦奎恩·马瑟斯(602380092) 2025-01-27 20:05:18
+【这人不简单啊】#和对方握手，“幸会幸会”
+
 以实玛利(1316702392) 2025-01-27 20:00:02
 （白师傅，能不能别念了）
 
@@ -32,4 +38,4 @@ MIKU(2754533655) 2025-01-27 20:02:06
 也站起身子探头探脑的看了看办公室
 
 MIKU(2754533655) 2025-01-27 20:02:18
-可以注意到瑞德曼的办公桌上有一份卷宗，卷宗上写着“4·22 袭警案调查报告”的字样：
-\ No newline at end of file
+可以注意到瑞德曼的办公桌上有一份卷宗，卷宗上写着“4·22袭警案调查报告”的字样：
+\ No newline at end of file
diff --git a/test/example_rule.json b/test/example_rule.json
index d385f38..0cb5b6c 100644
--- a/test/example_rule.json
+++ b/test/example_rule.json
@@ -1,32 +1,66 @@
 [
     {
-        "pattern": "^(\\S+)\\((\\d+)\\)\\s+(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})",
+        // 匹配日志元数据，提取 id、QQ 账号和时间。例如：墨勒托.DW(1571806261) 2025-01-27 19:58:15
         "type": "metadata",
-        "description": "匹配日志元数据，提取 id、QQ 账号和时间。例如：墨勒托.DW(1571806261) 2025-01-27 19:58:15"
+        "patterns": [
+            "^(\\S+)\\((\\d+)\\)\\s+(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})"
+        ],
+        "groups": [
+            "user_name",
+            "user_id",
+            "time"
+        ]
     },
     {
-        "pattern": "^#(.+)",
         "type": "action",
-        "description": "匹配行动，以 # 开头。例如：# 我推开门"
+        "patterns": [
+            "^#s*((?:(?![“”\"（【】]).)+)"
+        ], // 排除后续特殊符号
+        "groups": [
+            "action_content"
+        ]
     },
     {
-        "pattern": "“(.+)”",
         "type": "speech",
-        "description": "匹配玩家发言，双引号内的内容。例如：\"你好，我是冒险者\""
+        "patterns": [
+            "[“](.+?)[”]", // 中文引号
+            "\"(.*?)\"", // 英文引号
+            "”(.+?)“" // 混合引号
+        ],
+        "groups": [
+            "speech_content"
+        ]
     },
     {
-        "pattern": "\\((.+)\\)",
         "type": "ooc_speech",
-        "description": "匹配场外发言，括号内的内容。例如：(今天没时间跑团)"
+        "patterns": [
+            // "((.*?))", // 英文括号
+            "（(.*?)）", // 中文括号
+            // "((.*)", // 未闭合英文括号
+            "（(.*)" // 未闭合中文括号
+        ],
+        "groups": [
+            "ooc_content"
+        ]
     },
     {
-        "pattern": "^(?:[\\.。]([^.。].+))",
+        // 匹配掷骰指令，以 . 或 。开头但是不匹配连续的指令前缀。例如：匹配".ra智力"，不匹配"。。。"
         "type": "dice_order",
-        "description": "匹配掷骰指令，以 . 或 。 开头但是不匹配连续的指令前缀。例如：匹配.ra智力，不匹配'。。。'"
+        "patterns": [
+            "^(?:[\\.。]([^.。].+))"
+        ],
+        "groups": [
+            "dice_command"
+        ]
     },
     {
-        "pattern": "【(.+)】",
+        // 匹配角色心理活动。例如：【这里好可怕】
         "type": "thought",
-        "description": "匹配角色心理活动。例如：【这里好可怕】"
+        "patterns": [
+            "【(.+)】"
+        ],
+        "groups": [
+            "thought_content"
+        ]
     }
 ]
 \ No newline at end of file
diff --git a/uv.lock b/uv.lock
index 72093e4..e38f683 100644
--- a/uv.lock
+++ b/uv.lock
@@ -179,6 +179,7 @@ source = { editable = "." }
 dev = [
     { name = "docutils" },
     { name = "furo" },
+    { name = "json5" },
     { name = "myst-parser", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
     { name = "myst-parser", version = "4.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "ruff" },
@@ -196,6 +197,7 @@ dev = [
 dev = [
     { name = "docutils", specifier = ">=0.21.2" },
     { name = "furo", specifier = ">=2024.8.6" },
+    { name = "json5", specifier = ">=0.10.0" },
     { name = "myst-parser", specifier = ">=3.0.1" },
     { name = "ruff", specifier = ">=0.9.6" },
     { name = "sphinx", specifier = ">=7.4.7" },
@@ -291,6 +293,15 @@ wheels = [
 ]
 
 [[package]]
+name = "json5"
+version = "0.10.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/85/3d/bbe62f3d0c05a689c711cff57b2e3ac3d3e526380adb7c781989f075115c/json5-0.10.0.tar.gz", hash = "sha256:e66941c8f0a02026943c52c2eb34ebeb2a6f819a0be05920a6f5243cd30fd559", size = 48202 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/aa/42/797895b952b682c3dafe23b1834507ee7f02f4d6299b65aaa61425763278/json5-0.10.0-py3-none-any.whl", hash = "sha256:19b23410220a7271e8377f81ba8aacba2fdd56947fbb137ee5977cbe1f5e8dfa", size = 34049 },
+]
+
+[[package]]
 name = "markdown-it-py"
 version = "3.0.0"
 source = { registry = "https://pypi.org/simple" }
author	简律纯 <i@jyunko.cn>	2025-03-15 21:53:01 +0800
committer	GitHub <noreply@github.com>	2025-03-15 21:53:01 +0800
commit	5319feea52f7266029b9a3a609a3f1ae494c6a60 (patch)
tree	33ea3be55c3c102f5975a1f2796cf6e34062d026
parent	965771fb0d85ddb27dc6c5dd7df822d1fb318286 (diff)
parent	5f01c1710d4b6ae1e0ce9fba10f1528711a2f63f (diff)
download	conventional_role_play-5319feea52f7266029b9a3a609a3f1ae494c6a60.tar.gz conventional_role_play-5319feea52f7266029b9a3a609a3f1ae494c6a60.zip