# utils/process_log.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53

import glob
import json
import re


# Matches parenthesised numeric markers such as "(12)"; compiled once and reused.
_NUMBER_MARKER_RE = re.compile(r"\(\d+\)")


def _iter_paragraphs(lines):
    """Yield each paragraph in *lines* as a single newline-joined string.

    A paragraph is a run of consecutive lines containing non-whitespace
    characters; whitespace-only lines act as separators. Only the trailing
    newline of each line is removed, so other leading/trailing whitespace
    is kept.
    """
    current = []
    for line in lines:
        text = line.rstrip("\n")
        if text.strip():
            current.append(text)
        elif current:
            yield "\n".join(current)
            current = []
    # The last paragraph has no blank line after it when the input ends.
    if current:
        yield "\n".join(current)


def process_g_files() -> None:
    """Collect paragraphs from every ``g*.txt`` file into ``processed_logs.json``.

    Files matching ``g*.txt`` in the current working directory are read as
    UTF-8 and split into blank-line-separated paragraphs. Parenthesised
    numbers such as ``(12)`` are removed from each paragraph, and the
    results are written as a JSON list of ``{"text": ...}`` objects.

    Files are processed in sorted name order so the output is reproducible;
    ``glob.glob`` alone returns names in a filesystem-dependent order.

    A file that cannot be opened or decoded is reported and skipped.
    Paragraphs already read from that file before the error are kept.
    If no file matches, a message is printed and no output file is written.
    """
    files = sorted(glob.glob("g*.txt"))

    if not files:
        print("未找到以'g'开头的txt文件")
        return

    print(f"找到 {len(files)} 个文件: {', '.join(files)}")

    all_entries = []

    for file_path in files:
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                # Append one paragraph at a time so that a read error part-way
                # through a file still keeps the paragraphs seen so far.
                for paragraph_text in _iter_paragraphs(file):
                    cleaned_text = _NUMBER_MARKER_RE.sub("", paragraph_text)
                    all_entries.append({"text": cleaned_text})

            print(f"处理文件 {file_path} 完成")

        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"处理文件 {file_path} 时出错: {e}")

    output_file = "processed_logs.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_entries, f, ensure_ascii=False, indent=2)

    print(f"\n处理完成! 共处理 {len(all_entries)} 个段落")
    print(f"结果已保存到 {output_file}")


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_g_files()