From 898db38f66f2b6b0047df75eaf3ced0d64cda664 Mon Sep 17 00:00:00 2001 From: pine Date: Sat, 15 Mar 2025 17:54:05 +0800 Subject: feat: ➕ Add dependency json5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 1 + src/conventionalrp/core/parser.py | 6 +++--- uv.lock | 11 +++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 37f1421..78a737a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,7 @@ line-ending = "auto" dev = [ "docutils>=0.21.2", "furo>=2024.8.6", + "json5>=0.10.0", "myst-parser>=3.0.1", "ruff>=0.9.6", "sphinx>=7.4.7", diff --git a/src/conventionalrp/core/parser.py b/src/conventionalrp/core/parser.py index d5b91da..4d9b975 100644 --- a/src/conventionalrp/core/parser.py +++ b/src/conventionalrp/core/parser.py @@ -1,4 +1,4 @@ -import json +import json5 import re from pathlib import Path @@ -15,7 +15,7 @@ class Parser: with open(rules_path, "r", encoding="utf-8") as f: file_content = f.read() - rules = json.loads(file_content) + rules = json5.loads(file_content) # validation rule format if rules is None: @@ -54,4 +54,4 @@ class Parser: else: parsed_data.append({"content": line.strip(), "type": "unknown"}) - return parsed_data + return parsed_data \ No newline at end of file diff --git a/uv.lock b/uv.lock index 72093e4..e38f683 100644 --- a/uv.lock +++ b/uv.lock @@ -179,6 +179,7 @@ source = { editable = "." } dev = [ { name = "docutils" }, { name = "furo" }, + { name = "json5" }, { name = "myst-parser", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "myst-parser", version = "4.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "ruff" }, @@ -196,6 +197,7 @@ dev = [ dev = [ { name = "docutils", specifier = ">=0.21.2" }, { name = "furo", specifier = ">=2024.8.6" }, + { name = "json5", specifier = ">=0.10.0" }, { name = "myst-parser", specifier = ">=3.0.1" }, { name = "ruff", specifier = ">=0.9.6" }, { name = "sphinx", specifier = ">=7.4.7" }, @@ -290,6 +292,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb", size = 134596 }, ] +[[package]] +name = "json5" +version = "0.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/3d/bbe62f3d0c05a689c711cff57b2e3ac3d3e526380adb7c781989f075115c/json5-0.10.0.tar.gz", hash = "sha256:e66941c8f0a02026943c52c2eb34ebeb2a6f819a0be05920a6f5243cd30fd559", size = 48202 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/42/797895b952b682c3dafe23b1834507ee7f02f4d6299b65aaa61425763278/json5-0.10.0-py3-none-any.whl", hash = "sha256:19b23410220a7271e8377f81ba8aacba2fdd56947fbb137ee5977cbe1f5e8dfa", size = 34049 }, +] + [[package]] name = "markdown-it-py" version = "3.0.0" -- cgit v1.2.3-70-g09d2 From 421dd2a20c82339392359ff7302f09e469a0c27c Mon Sep 17 00:00:00 2001 From: pine Date: Sat, 15 Mar 2025 17:54:57 +0800 Subject: chore: 📝 Update example_log.log & example_rule.json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/example_log.log | 8 ++++++- test/example_rule.json | 58 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/test/example_log.log b/test/example_log.log index aff1d3a..b7e8597 100644 --- a/test/example_log.log +++ b/test/example_log.log @@ -10,6 +10,12 @@ MIKU(2754533655) 2025-01-27 19:58:39 以实玛利(1316702392) 2025-01-27 19:58:42 “很高兴认识你,我是以实玛利” +麦奎恩·马瑟斯(602380092) 2025-01-27 20:05:18 +“你好,”#向前伸出手,“鄙人马瑟斯” + +麦奎恩·马瑟斯(602380092) 2025-01-27 20:05:18 +【这人不简单啊】#和对方握手,“幸会幸会” + 以实玛利(1316702392) 2025-01-27 20:00:02 (白师傅,能不能别念了) @@ -32,4 +38,4 @@ MIKU(2754533655) 2025-01-27 20:02:06 也站起身子探头探脑的看了看办公室 MIKU(2754533655) 2025-01-27 20:02:18 -可以注意到瑞德曼的办公桌上有一份卷宗,卷宗上写着“4·22 袭警案调查报告”的字样: \ No newline at end of file +可以注意到瑞德曼的办公桌上有一份卷宗,卷宗上写着“4·22袭警案调查报告”的字样: \ No newline at end of file diff --git a/test/example_rule.json b/test/example_rule.json index d385f38..0cb5b6c 100644 --- a/test/example_rule.json +++ b/test/example_rule.json @@ -1,32 +1,66 @@ [ { - "pattern": "^(\\S+)\\((\\d+)\\)\\s+(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})", + // 匹配日志元数据,提取 id、QQ 账号和时间。例如:墨勒托.DW(1571806261) 2025-01-27 19:58:15 "type": "metadata", - "description": "匹配日志元数据,提取 id、QQ 账号和时间。例如:墨勒托.DW(1571806261) 2025-01-27 19:58:15" + "patterns": [ + "^(\\S+)\\((\\d+)\\)\\s+(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})" + ], + "groups": [ + "user_name", + "user_id", + "time" + ] }, { - "pattern": "^#(.+)", "type": "action", - "description": "匹配行动,以 # 开头。例如:# 我推开门" + "patterns": [ + "^#s*((?:(?![“”\"(【】]).)+)" + ], // 排除后续特殊符号 + "groups": [ + "action_content" + ] }, { - "pattern": "“(.+)”", "type": "speech", - "description": "匹配玩家发言,双引号内的内容。例如:\"你好,我是冒险者\"" + "patterns": [ + "[“](.+?)[”]", // 中文引号 + "\"(.*?)\"", // 英文引号 + "”(.+?)“" // 混合引号 + ], + "groups": [ + "speech_content" + ] }, { - "pattern": "\\((.+)\\)", "type": "ooc_speech", - "description": "匹配场外发言,括号内的内容。例如:(今天没时间跑团)" + "patterns": [ + // "((.*?))", // 英文括号 + "((.*?))", // 中文括号 + // "((.*)", // 未闭合英文括号 + "((.*)" // 未闭合中文括号 + ], + "groups": [ + "ooc_content" + ] }, { - "pattern": "^(?:[\\.。]([^.。].+))", + // 匹配掷骰指令,以 . 或 。开头但是不匹配连续的指令前缀。例如:匹配".ra智力",不匹配"。。。" "type": "dice_order", - "description": "匹配掷骰指令,以 . 或 。 开头但是不匹配连续的指令前缀。例如:匹配.ra智力,不匹配'。。。'" + "patterns": [ + "^(?:[\\.。]([^.。].+))" + ], + "groups": [ + "dice_command" + ] }, { - "pattern": "【(.+)】", + // 匹配角色心理活动。例如:【这里好可怕】 "type": "thought", - "description": "匹配角色心理活动。例如:【这里好可怕】" + "patterns": [ + "【(.+)】" + ], + "groups": [ + "thought_content" + ] } ] \ No newline at end of file -- cgit v1.2.3-70-g09d2 From 5f01c1710d4b6ae1e0ce9fba10f1528711a2f63f Mon Sep 17 00:00:00 2001 From: pine Date: Sat, 15 Mar 2025 17:58:19 +0800 Subject: feat: 🎨 More standardized log parse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/conventionalrp/core/parser.py | 88 +++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/src/conventionalrp/core/parser.py b/src/conventionalrp/core/parser.py index 4d9b975..f451178 100644 --- a/src/conventionalrp/core/parser.py +++ b/src/conventionalrp/core/parser.py @@ -34,24 +34,88 @@ class Parser: with open(log_path, "r", encoding="utf-8") as f: log_content = f.read().splitlines() + current_metadata = None + current_content = [] + # Iterate each line of the log for line in log_content: # pass blank line if not line.strip(): continue - # try to match the current line by rules + # metadata detect + is_metadata = False for rule in self.rules: - pattern = rule.get("pattern") - rule_type = rule.get("type") - match = re.search(pattern, line) - if match: - # matched - content = match.group(1).strip() - parsed_data.append({"content": content, "type": rule_type}) - break - # no matched, marked as an unknown type - else: - parsed_data.append({"content": line.strip(), "type": "unknown"}) + if rule.get("type") == "metadata": + patterns = rule.get("patterns", []) + for pattern in patterns: + match = re.search(pattern, line) + if match: + # If it's metadata, save the previous content + if current_metadata: + parsed_data.append({ + **current_metadata, + "content": current_content + }) + current_content = [] + + # Parsing new metadata + current_metadata = {} + groups = rule.get("groups", []) + for i, key in enumerate(groups): + if i + 1 <= len(match.groups()): # Ensure effective + current_metadata[key] = match.group(i + 1).strip() + is_metadata = True + break + if is_metadata: + break + + if is_metadata: + continue # The metadata line has been processed, skip subsequent content matching + + # content detect + remaining_line = line + while remaining_line: + matched = False + for rule in self.rules: + # pass metadata rule + if rule["type"] == "metadata": + continue + + for pattern in rule["patterns"]: + match = re.match(pattern, remaining_line) + if match: + # If the matching content is not the beginning, it means that there is unknown content in front of it + if match.start() > 0: + current_content.append({ + "type": "unknown", + "content": remaining_line[:match.start()] + }) + + # Extract matched content + entry = {"type": rule["type"], "content": match.group(0)} + for i, group in enumerate(rule["groups"]): + entry[group] = match.group(i+1).strip() if match.group(i+1) else "" + + current_content.append(entry) + remaining_line = remaining_line[match.end():].lstrip() + matched = True + break + if matched: + break + + if not matched: + current_content.append({ + "type": "unknown", + "content": remaining_line + }) + remaining_line = "" + + # Process the last line + if current_metadata: + parsed_data.append({ + **current_metadata, + "content": current_content + }) return parsed_data \ No newline at end of file -- cgit v1.2.3-70-g09d2