Spider 11: Multi-threaded Crawling



Multi-threaded crawling

  • Queue
    from multiprocessing import Queue
    put()
    get() (get_nowait() is equivalent to get(False)); see the sketch just below this list
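
A minimal sketch of those queue calls, using multiprocessing.Queue as the spider below does (queue.Queue from the standard queue module offers the same put/get interface for threads):

from multiprocessing import Queue
from queue import Empty

q = Queue()
q.put('task 1')        # enqueue an item
print(q.get())         # blocking get: waits until an item is available
try:
    q.get_nowait()     # same as q.get(False): raises queue.Empty when nothing is queued
except Empty:
    print('queue is empty')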

Thread creation

from threading import Thread
t = Thread(target=func)   # func: the function the thread will run
t.start()
t.join()
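
A runnable sketch of the same calls; crawl here is just a placeholder worker function, not part of the spider below:

from threading import Thread

def crawl(page):
    print('crawling page', page)

threads = []
for page in range(3):
    t = Thread(target=crawl, args=(page,))   # target = function to run, args = its arguments
    threads.append(t)
    t.start()       # start the worker thread
for t in threads:
    t.join()        # wait for every thread to finish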

Xiaomi App Store

Target: the app category listing of the Xiaomi App Store. Fields to scrape: app name and app link.

http://app.mi.com/categotyAllListApi?page=2&categoryId=10&pageSize=30

F12 -> Network -> Query String Parameters:
page: xx
categoryId: 10
pageSize: 30
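
Before writing the full spider it helps to confirm those parameters with a single request. This is only a sanity-check sketch, assuming the endpoint is still live; the JSON fields it reads (data, displayName, packageName) are the same ones the spider below parses:

import requests

url = 'http://app.mi.com/categotyAllListApi'
params = {'page': '0', 'categoryId': '10', 'pageSize': '30'}
headers = {'User-Agent': 'Mozilla/5.0'}

res = requests.get(url, params=params, headers=headers)   # requests urlencodes params for us
apps = res.json()['data']
print(len(apps), 'apps on this page')
print(apps[0]['displayName'], apps[0]['packageName'])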


Code

import requests
from multiprocessing import Queue
from threading import Thread
import json
import urllib.parse
import time

class MiSpider(object):
    def __init__(self):
        self.baseurl = 'http://app.mi.com/categotyAllListApi?'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

        # queue of URLs waiting to be fetched
        self.urlQueue = Queue()
        # queue of HTML responses waiting to be parsed
        self.parseQueue = Queue()

    def getUrl(self):
        '''Build the page URLs and put them on the URL queue.'''
        for page in range(10):
            params = {
                'page': str(page),
                'categoryId': '10',
                'pageSize': '30'
            }
            # urlencode the params, append to the base URL, enqueue
            params = urllib.parse.urlencode(params)
            url = self.baseurl + params
            self.urlQueue.put(url)

    def getHtml(self):
        '''Fetch each URL and hand the HTML to the parse queue.'''
        while not self.urlQueue.empty():
            url = self.urlQueue.get()
            res = requests.get(url, headers=self.headers)
            res.encoding = 'utf-8'
            html = res.text

            self.parseQueue.put(html)

    def parseHtml(self):
        '''Extract the app name and link from each page of JSON.'''
        while not self.parseQueue.empty():
            html = self.parseQueue.get()
            hList = json.loads(html)['data']
            # hList is a list of dicts, one per app
            for h in hList:
                name = h['displayName']
                d = {'name': name, 'link': 'http://app.mi.com/details?id=' + h['packageName']}
                with open('xiaomiAPP.json', 'a', encoding='utf-8') as f:
                    f.write(json.dumps(d, ensure_ascii=False) + '\n')

    def workOn(self):
        self.getUrl()
        t1List = []
        t2List = []

        # fetch (collector) threads
        for i in range(3):
            t = Thread(target=self.getHtml)
            t1List.append(t)
            t.start()

        # join the fetch threads
        for i in t1List:
            i.join()

        # parse threads
        for i in range(3):
            t = Thread(target=self.parseHtml)
            t2List.append(t)
            t.start()

        for i in t2List:
            i.join()


if __name__ == "__main__":
    begin = time.time()
    spider = MiSpider()
    spider.workOn()
    end = time.time()
    print('finished successfully in %.2f seconds' % (end - begin))
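
One caveat: the pattern "while not queue.empty(): queue.get()" can block forever if another thread drains the queue between the empty() check and the get(). The get_nowait() form noted at the top avoids this. A sketch of that consumer loop, written as a drop-in replacement for MiSpider.getHtml and assuming the same self.urlQueue / self.parseQueue / self.headers:

from queue import Empty

def getHtml(self):
    '''Fetch URLs until the queue is drained, without the empty()/get() race.'''
    while True:
        try:
            url = self.urlQueue.get_nowait()   # same as get(False): raises Empty instead of blocking
        except Empty:
            break                              # nothing left, let this thread exit
        res = requests.get(url, headers=self.headers)
        res.encoding = 'utf-8'
        self.parseQueue.put(res.text)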


