"""Collect blank-line-separated paragraphs from ``g*.txt`` files into JSON."""
# utils/process_log.py

import glob
import json
import re

# A parenthesised run of digits, e.g. "(12345)"; removed from every paragraph.
_PAREN_NUMBER = re.compile(r"\(\d+\)")


def _split_paragraphs(lines):
    """Group *lines* into paragraphs and return them as ``{"text": ...}`` dicts.

    A paragraph is a maximal run of lines that contain at least one
    non-whitespace character; empty or whitespace-only lines separate
    paragraphs. Lines are joined with ``"\\n"`` and every ``(digits)`` group
    is removed from the joined text.
    """
    entries = []
    paragraph = []

    def flush():
        # Emit the paragraph collected so far (if any) and start a new one.
        if paragraph:
            joined = "\n".join(paragraph)
            entries.append({"text": _PAREN_NUMBER.sub("", joined)})
            paragraph.clear()

    for line in lines:
        stripped_line = line.rstrip("\n")
        if stripped_line.strip():
            paragraph.append(stripped_line)
        else:
            flush()

    # The last paragraph is not necessarily followed by a blank line.
    flush()
    return entries


def process_g_files() -> None:
    """Convert every ``g*.txt`` file in the current directory to JSON entries.

    Each file is read as UTF-8 and split into paragraphs; the paragraphs of
    all files are written, as a list of ``{"text": ...}`` objects, to
    ``processed_logs.json`` in the current directory. Progress is reported on
    standard output.

    Files are processed in sorted name order so the output is reproducible
    (``glob.glob`` itself makes no ordering promise). A file that cannot be
    read or decoded is reported and skipped as a whole, so the output never
    contains a partially parsed file. If no file matches, nothing is written.
    """
    files = sorted(glob.glob("g*.txt"))

    if not files:
        print("未找到以'g'开头的txt文件")
        return

    print(f"找到 {len(files)} 个文件: {', '.join(files)}")

    all_entries = []

    for file_path in files:
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                file_entries = _split_paragraphs(file)
        except (OSError, UnicodeError) as e:
            print(f"处理文件 {file_path} 时出错: {e}")
        else:
            # Merge only after the whole file parsed successfully.
            all_entries.extend(file_entries)
            print(f"处理文件 {file_path} 完成")

    output_file = "processed_logs.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_entries, f, ensure_ascii=False, indent=2)

    print(f"\n处理完成! 共处理 {len(all_entries)} 个段落")
    print(f"结果已保存到 {output_file}")


if __name__ == "__main__":
    process_g_files()