trec06c_format.py 1.1 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. import os
  2. from email.parser import Parser
  3. from email.policy import default
# This script reorganizes the trec06c dataset and generates the training-set and test-set data files.
  5. def getData(path):
  6. f = open(path, 'r', encoding='gb2312', errors='ignore')
  7. data = f.read()
  8. headers = Parser(policy=default).parsestr(data)
  9. body = headers.get_payload()
  10. body = body.replace("\n", "")
  11. return headers["subject"], body
  12. num = 0
  13. # getData("../data/000/000")
  14. with open("index", "r") as f:
  15. with open("trec06c_train.csv", "w") as w:
  16. with open("trec06c_test.csv", "w") as wt:
  17. while True:
  18. line = f.readline()
  19. if not line:
  20. break
  21. infos = line.split(" ")
  22. subject, body = getData(infos[1].strip())
  23. tp = 0
  24. if infos[0].lower() == "spam":
  25. tp = 1
  26. data = "{} \t{} {}\n".format(tp, subject, body)
  27. if num < 55000:
  28. w.write(data)
  29. else:
  30. wt.write(data)
  31. num += 1
  32. print(num)