# 练习 3-4 垃圾邮件分类

In [1]:
# 定义一个提示下载进度的方法
from tqdm import tqdm, tqdm_notebook
def progress_hook(t):
    """
    Build a ``reporthook`` callback that forwards download progress to ``t``.

    ``t`` is a tqdm-style progress bar: it must expose a writable ``total``
    attribute and an ``update(n)`` method. Remember to call ``close()`` or
    ``__exit__()`` on it when the transfer is done; the simplest way is to
    use a ``with`` block.

    Example
    -------

    >>> with tqdm() as t:
    ...     reporthook = progress_hook(t)
    ...     request.urlretrieve(..., reporthook=reporthook)

    Returns
    -------
    callable
        A function ``(b=1, bsize=1, tsize=None)`` that advances ``t``.
    """
    # Block count reported by the previous callback invocation.
    blocks_seen = 0

    def update_to(b=1, bsize=1, tsize=None):
        """
        b   : int, optional
              Number of blocks processed so far, default 1.
        bsize   : int, optional
                    Size of each block (in tqdm units), default 1.
        tsize   : int, optional
                    Total size (in tqdm units); the default None leaves
                    the current total unchanged.
        """
        nonlocal blocks_seen
        if tsize is not None:
            t.total = tsize
        # The bar expects an increment, so convert the cumulative block
        # count into the amount transferred since the last call.
        newly_transferred = (b - blocks_seen) * bsize
        t.update(newly_transferred)
        blocks_seen = b

    return update_to

## 1. 从Apache SpamAssassin的公共数据集(http://spamassassin.apache.org/old/publiccorpus/)中下载垃圾邮件和非垃圾邮件

In [2]:
from urllib import request
import os
import tarfile

# Local directory that receives the downloaded archives and their contents.
SPAM_PATH = os.path.join("./datasets", "spam")

# Apache SpamAssassin public corpus: base URL and the two archives used here.
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = "{}{}".format(DOWNLOAD_ROOT, "20030228_easy_ham.tar.bz2")
SPAM_URL = "{}{}".format(DOWNLOAD_ROOT, "20030228_spam.tar.bz2")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    """
    Download the ham and spam archives (if missing) and unpack them.

    Parameters
    ----------
    spam_url : str, optional
        URL of the spam archive. Defaults to ``SPAM_URL``. The ham archive
        is always fetched from ``HAM_URL``.
    spam_path : str, optional
        Directory that receives both the ``.tar.bz2`` files and their
        extracted contents. It is created if it does not exist.

    Notes
    -----
    An archive already present in ``spam_path`` is not downloaded again,
    but it is re-extracted on every call.
    """
    os.makedirs(spam_path, exist_ok=True)
    # Use the spam_url argument (not the module constant) so that callers
    # who pass a different URL actually get that archive.
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            print("from {} download file saved to {}\n".format(url, path))
            # Download to a temporary name and rename on success, so an
            # interrupted transfer never leaves a truncated file that a
            # later call would mistake for a complete archive.
            partial_path = path + ".part"
            with tqdm(unit='B', unit_scale=True, leave=True, miniters=1) as t:
                request.urlretrieve(url, partial_path, reporthook=progress_hook(t))
            os.replace(partial_path, path)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; on Python versions that support extraction filters,
            # consider passing filter="data".
            # Extract into the spam_path argument, next to the archive.
            tar_bz2_file.extractall(path=spam_path)

In [3]:
# Fetch both archives (skipping any already on disk) and unpack them under SPAM_PATH.
fetch_spam_data()

0.00B [00:00, ?B/s]from http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2 download file saved to ./datasets/spam/ham.tar.bz2

1.61MB [02:16, 11.8kB/s]                            
0.00B [00:00, ?B/s]from http://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2 download file saved to ./datasets/spam/spam.tar.bz2

1.19MB [01:13, 16.2kB/s]


加载所有的邮件数据

In [4]:
def _list_message_filenames(directory, min_name_length=20):
    """
    Return the sorted entries of ``directory`` with sufficiently long names.

    Parameters
    ----------
    directory : str
        Directory to list.
    min_name_length : int, optional
        Only names strictly longer than this many characters are kept.
        Presumably this skips short auxiliary files that are not messages
        (message names look like ``00536.<32 hex chars>``) -- verify against
        the extracted corpus.

    Returns
    -------
    list of str
        Matching entry names (not full paths), in sorted order.
    """
    return [
        name
        for name in sorted(os.listdir(directory))
        if len(name) > min_name_length
    ]


# Directories created by extracting the two archives.
HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')
SPAM_DIR = os.path.join(SPAM_PATH, 'spam')

# One shared helper keeps the ham and spam filters identical.
ham_filenames = _list_message_filenames(HAM_DIR)
spam_filenames = _list_message_filenames(SPAM_DIR)

In [5]:
# Display every entry of HAM_DIR in sorted order (unfiltered, unlike ham_filenames).
sorted(os.listdir(HAM_DIR))

 '00536.ee0a85c68f0db6388d6f8a3468af70ac',
 '00537.0b676300c214afdc2dd6a5007e8b8e2a',
 '00538.313057481a3b1638c05066fad46ae65f',
 '00539.f3a3a009e8410ed004b045e724be525a',
 '00540.c9e660864381381e6a16c599c8f2e1fe',
 '00541.cbdcefd1a6109b8f95e1c8dddfbd7bb2',
 '00542.1db9cca8020648c0ed80436b9aea4d33',
 '00543.0641e755767b41b404070e155708cee6',
 '00544.5a9365cf80100b89b50656045cb8b80c',
 '00545.99996f28814c7028ce6aac44270ff3cd',
 '00546.30fed3b8e986dc41b1865b9285e84e56',
 '00547.59bf01e07cf08c7e3a778b747e020989',
 '00548.120e45c5d33311bc09e844bf236521d2',
 '00549.a847ea8934802a0ec67a7fd1d136d26d',
 '00550.02e6c81fb637ae555b997d6fd72df731',
 '00551.1c59fd8e4f3176c859b79b9a75fcc3b6',
 '00552.2a17c933697e682b93c2f32b66230a3b',
 '00553.d1e0ab732c8cbe70432e98301e352954',
 '00554.a01a74aee9653a7ae8d1d558c75f0a5d',
 '00555.4139b207075574b3774942a3a42013d8',
 '00556.eea9a26e128c1b4b676b4180adbd547a',
 '00557.62ed7b82fd342ca4d7932ccee2552337',
 '00558.95b8c2677759a2f569a5dc0bd70b8cc0',
 '00559.caa