summary refs log tree commit diff stats homepage
path: root/utils/process_log.py
diff options
context:
space:
mode:
Diffstat (limited to 'utils/process_log.py')
-rw-r--r-- utils/process_log.py 53
1 files changed, 53 insertions, 0 deletions
diff --git a/utils/process_log.py b/utils/process_log.py
new file mode 100644
index 0000000..fffc359
--- /dev/null
+++ b/utils/process_log.py
@@ -0,0 +1,53 @@
+import glob
+import json
+import re
+
+
# Parenthesised numeric markers such as "(12)"; removed from every paragraph.
_NUMERIC_MARKER_RE = re.compile(r"\(\d+\)")


def _iter_paragraphs(lines):
    """Yield blank-line-separated paragraphs from *lines*, markers removed.

    A line that is empty or whitespace-only ends the current paragraph.
    Lines inside a paragraph keep their own leading/trailing spaces (only the
    trailing newline is dropped) and are joined with "\\n". Every "(digits)"
    marker is deleted from the joined text.
    """
    buffered = []
    for raw_line in lines:
        line = raw_line.rstrip("\n")
        if line.strip():
            buffered.append(line)
        elif buffered:
            yield _NUMERIC_MARKER_RE.sub("", "\n".join(buffered))
            buffered = []

    # The last paragraph is not necessarily followed by a blank line.
    if buffered:
        yield _NUMERIC_MARKER_RE.sub("", "\n".join(buffered))


def process_g_files():
    """Convert every ``g*.txt`` file in the current directory to one JSON file.

    Each input file is read as UTF-8 and split into paragraphs on blank
    lines; "(digits)" markers are stripped from each paragraph. All
    paragraphs are written to ``processed_logs.json`` as a list of
    ``{"text": ...}`` objects.

    Files are processed in sorted name order so the output is reproducible
    (``glob`` itself makes no ordering guarantee). A file that cannot be read
    or decoded is reported and skipped as a whole; it contributes no entries.

    If no matching file exists, a message is printed and nothing is written.

    Returns:
        None. Results are reported on stdout and written to disk.
    """
    files = sorted(glob.glob("g*.txt"))

    if not files:
        print("未找到以'g'开头的txt文件")
        return

    print(f"找到 {len(files)} 个文件: {', '.join(files)}")

    all_entries = []

    for file_path in files:
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                # Materialise inside the try so a read/decode error raised
                # mid-file discards this file's entries instead of leaving
                # a partial set behind.
                file_entries = [
                    {"text": paragraph} for paragraph in _iter_paragraphs(file)
                ]
        except (OSError, UnicodeError) as e:
            print(f"处理文件 {file_path} 时出错: {e}")
        else:
            all_entries.extend(file_entries)
            print(f"处理文件 {file_path} 完成")

    output_file = "processed_logs.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_entries, f, ensure_ascii=False, indent=2)

    print(f"\n处理完成! 共处理 {len(all_entries)} 个段落")
    print(f"结果已保存到 {output_file}")
+
+
# Script entry point: run the conversion only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    process_g_files()