trec07p_format.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. import os
  2. from email.parser import Parser
  3. from email.policy import default
  4. from bs4 import BeautifulSoup
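# This script reformats the TREC spam corpus into training-set and test-set files (the original note names trec06c, but the output files are named trec07p_*).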
  6. def getData(path):
  7. f = open(path, 'r', errors='ignore')
  8. data = f.read()
  9. headers = Parser(policy=default).parsestr(data)
  10. body = ""
  11. if headers.is_multipart():
  12. for part in headers.iter_parts():
  13. tbody = part.get_payload()
  14. if isinstance(tbody, list):
  15. for item in tbody:
  16. txt = item.get_payload()
  17. if isinstance(tbody, list):
  18. return "", ""
  19. bsObj = BeautifulSoup(txt, 'lxml')
  20. body += bsObj.get_text()
  21. else:
  22. bsObj = BeautifulSoup(tbody, 'lxml')
  23. body += bsObj.get_text()
  24. else:
  25. tbody = headers.get_payload()
  26. bsObj = BeautifulSoup(tbody, 'lxml')
  27. body += bsObj.get_text()
  28. return headers["subject"], body.replace("\n", "")
  29. num = 0
  30. # getData("../data/000/000")
  31. with open("index", "r") as f:
  32. with open("trec07p_train.csv", "w") as w:
  33. with open("trec07p_test.csv", "w") as wt:
  34. while True:
  35. line = f.readline()
  36. if not line:
  37. break
  38. infos = line.split(" ")
  39. subject, body = getData(infos[1].strip())
  40. if subject == "":
  41. continue
  42. tp = 0
  43. if infos[0].lower() == "spam":
  44. tp = 1
  45. data = "{} \t{} {}\n".format(tp, subject, body)
  46. if num < 55000:
  47. w.write(data)
  48. else:
  49. wt.write(data)
  50. num += 1
  51. print(num)