41 lines
1.6 KiB
Python
41 lines
1.6 KiB
Python
"""Flatten chat-log text files into a single training text file.

Every file found under ``download/QQMsg`` is read, its first eight lines are
skipped, and the remaining lines are classified as either a timestamp line
or a message line.  Message lines are written to ``data/train_qq.txt`` one
per line; a blank line is written whenever two consecutive timestamps are
more than two minutes apart (or go backwards), marking a new conversation.
"""
import os
from datetime import datetime

# Locations used when the file is run as a script.
SOURCE_DIR = "download/QQMsg"
OUTPUT_PATH = "data/train_qq.txt"

# Number of leading lines ignored in every input file (presumably the
# export's preamble -- the original loop simply started at index 8).
HEADER_LINES = 8

# Two timestamps further apart than this start a new conversation.
CONVERSATION_GAP_SECONDS = 120

TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

# Placeholder texts that are removed from message lines before writing.
PLACEHOLDERS = ("[图片]", "[表情]", "[合并转发]请使用手机QQ最新版本查看")


def _parse_timestamp(raw):
    """Return the datetime carried by *raw*, or ``None`` if it is a message.

    A line counts as a timestamp line when either its first two
    space-separated fields (if the line starts with ``'2'``) or its second
    and third fields form a ``YYYY-MM-DD HH:MM:SS`` string.
    """
    parts = raw.split(" ", 2)
    try:
        if parts[0][0] == "2":
            stamp = parts[0] + " " + parts[1]
        else:
            stamp = parts[1] + " " + parts[2]
        return datetime.strptime(stamp, TIMESTAMP_FORMAT)
    except (IndexError, ValueError):
        # Too few fields, an empty first field, or not a date: a message.
        return None


def _strip_placeholders(raw):
    """Return *raw* with every placeholder text removed."""
    for placeholder in PLACEHOLDERS:
        raw = raw.replace(placeholder, "")
    return raw


def _write_conversations(lines, out):
    """Write the messages found in *lines* to the open text file *out*.

    Timestamp lines are never written themselves; they only decide whether
    a blank separator line is emitted before the following messages.
    """
    # Starting from the epoch makes the first timestamp of every file open
    # a new conversation.
    last_time = datetime(1970, 1, 1)
    for line in lines:
        raw = line.replace("\r\n", "").replace("\n", "")

        stamp = _parse_timestamp(raw)
        if stamp is not None:
            # total_seconds() covers the whole difference.  The previous
            # ``.seconds`` attribute is only the 0..86399 remainder, so gaps
            # of a day or more could be mistaken for a few seconds and a
            # negative gap could never be detected as such.
            gap = (stamp - last_time).total_seconds()
            if gap > CONVERSATION_GAP_SECONDS or gap < 0:
                out.write("\n")
            last_time = stamp
            continue

        message = _strip_placeholders(raw)
        if message:
            out.write(message + "\n")


def convert_logs(source_dir=SOURCE_DIR, output_path=OUTPUT_PATH):
    """Convert every file below *source_dir* into *output_path*.

    Args:
        source_dir: Directory that is walked recursively for input files.
        output_path: Text file that is (over)written with the result.

    Raises:
        OSError: If the output file or an input file cannot be opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    with open(output_path, "w", encoding="utf-8") as out:
        for path, _dir_names, file_names in os.walk(source_dir):
            for file_name in file_names:
                full_path = os.path.join(path, file_name)
                print(full_path)
                with open(full_path, "r", encoding="utf-8") as log_file:
                    lines = log_file.readlines()
                _write_conversations(lines[HEADER_LINES:], out)


if __name__ == "__main__":
    convert_logs()