| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- import os
- from email.parser import Parser
- from email.policy import default
- from bs4 import BeautifulSoup
# This script preprocesses the trec06c dataset and generates the training-set and test-set data files.
def _payload_text(payload):
    """Return the text content of *payload* with any HTML markup stripped."""
    return BeautifulSoup(payload, 'lxml').get_text()


def getData(path):
    """Read one e-mail file and return its subject and plain-text body.

    The file is parsed with ``email.parser.Parser`` and every textual payload
    is passed through BeautifulSoup (``lxml`` parser) so that markup is
    reduced to its text content.

    Args:
        path: Path of the raw e-mail file. Characters that cannot be decoded
            are dropped (``errors='ignore'``).

    Returns:
        A ``(subject, body)`` tuple. ``subject`` is the value of the
        ``Subject`` header (``None`` when the header is absent) and ``body``
        is the concatenated text of all parts with newlines removed.
        ``("", "")`` is returned when a part is nested more than two
        multipart levels deep; an empty subject is the "skip this message"
        signal for the caller.
    """
    # A context manager closes the file even if parsing raises.
    with open(path, 'r', errors='ignore') as f:
        message = Parser(policy=default).parsestr(f.read())

    chunks = []
    if message.is_multipart():
        for part in message.iter_parts():
            payload = part.get_payload()
            if not isinstance(payload, list):
                chunks.append(_payload_text(payload))
                continue
            for item in payload:
                inner = item.get_payload()
                # Test the *inner* payload: the previous code re-tested the
                # outer list here, which is always a list inside this loop,
                # so every message with a nested multipart part was dropped.
                if isinstance(inner, list):
                    return "", ""
                chunks.append(_payload_text(inner))
    else:
        chunks.append(_payload_text(message.get_payload()))

    return message["subject"], "".join(chunks).replace("\n", "")
# Driver: walk the index file, extract every listed message with getData()
# and write one "<label> \t<subject> <body>" line per message.  The first
# 55000 written messages go to the training file, the rest to the test file.
num = 0  # number of messages written so far
with open("index", "r") as f, \
        open("trec07p_train.csv", "w") as w, \
        open("trec07p_test.csv", "w") as wt:
    for line in f:
        # Each index line is read as "<label> <path>", separated by a space.
        infos = line.split(" ")
        # Skip blank or malformed lines instead of failing with IndexError
        # on infos[1] (e.g. a trailing empty line in the index).
        if len(infos) < 2:
            continue
        subject, body = getData(infos[1].strip())
        # getData() returns an empty subject for messages it cannot handle.
        if subject == "":
            continue
        # Label: 1 for spam, 0 for everything else.
        tp = 1 if infos[0].lower() == "spam" else 0
        data = "{} \t{} {}\n".format(tp, subject, body)
        if num < 55000:
            w.write(data)
        else:
            wt.write(data)
        num += 1
# NOTE(review): the total is printed once after the loop; confirm this matches
# the intended placement (it may have been a per-message progress print).
print(num)
|