Crawler 04: Persistent Data Storage

MongoDB

pymongo

  • 1. Connection object
    conn = pymongo.MongoClient('IP', 27017)
  • 2. Database object
    db = conn['db_name']
  • 3. Collection object
    myset = db['collection_name']
  • 4. Insert a document (a plain dict)
    myset.insert_one({...})
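
Putting the four steps together, a minimal sketch (the host IP and the mydb/top100 names are placeholders borrowed from the example further down):

import pymongo

# 1. Connection object
conn = pymongo.MongoClient('192.168.56.131', 27017)
# 2. Database object
db = conn['mydb']
# 3. Collection object
myset = db['top100']
# 4. Insert one document (a plain dict)
myset.insert_one({'name': '霸王别姬', 'time': '1993'})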

>>>show dbs
>>>use db_name
>>>show collections
>>>db.collection_name.find().pretty()
>>>db.collection_name.count()
>>>db.dropDatabase()
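
The same checks can also be run from pymongo instead of the mongo shell; a rough equivalent, assuming the conn/db/myset objects from the sketch above:

print(conn.list_database_names())    # show dbs
print(db.list_collection_names())    # show collections
for doc in myset.find():             # db.collection_name.find()
    print(doc)
print(myset.count_documents({}))     # db.collection_name.count()
conn.drop_database('mydb')           # db.dropDatabase()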

Example

import urllib.request
import re 
import pymongo
import time 

class MaoyanSpider(object):
    def __init__(self):
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}
        self.page = 1
        # Connection object
        self.conn = pymongo.MongoClient(
                         '192.168.56.131',
                         27017)
        # Database object
        self.db = self.conn['mydb']
        # Collection object
        self.myset = self.db['top100']

    # Fetch a page
    def getPage(self,url):
        # Three steps: build the request, send it, read the response
        req = urllib.request.Request(url,
                   headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        self.parsePage(html)

    # Parse the page
    def parsePage(self,html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>',re.S)
        rList = p.findall(html)
        # rList:[('霸王别姬','张国荣','1993-'),(),()]
        self.writeMongo(rList)

    # Save the data
    def writeMongo(self,rList):
        for rt in rList:
            d = {
                "name" : rt[0].strip(),
                "star" : rt[1].strip(),
                "time" : rt[2].strip()
              }
            # Insert into MongoDB
            self.myset.insert_one(d)

    # Entry point
    def workOn(self):
        for pg in range(0,21,10):
            url = self.baseurl+str(pg)
            self.getPage(url)
            print('Page %d crawled successfully' % self.page)
            time.sleep(3)
            self.page += 1

if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.workOn()

MySQL

1. Create the database connection object
2. Create the cursor object
3. Execute the SQL statement
4. (Important) Commit to the database
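
A minimal sketch of those four steps with pymysql (the host, account and table are the ones used in the full example further down):

import pymysql

# 1. Create the database connection object
db = pymysql.connect(host='192.168.56.131', user='lion',
                     password='123456', database='mydb', charset='utf8')
# 2. Create the cursor object
cursor = db.cursor()
# 3. Execute the statement (parameters passed as a list, never string-formatted)
cursor.execute('insert into top100(name,star,time) values(%s,%s,%s)',
               ['霸王别姬', '张国荣', '1993-01-01'])
# 4. (Important) Commit, then clean up
db.commit()
cursor.close()
db.close()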

1. Create the database
mysql> create database mydb charset=utf8;
2. Create the table
use mydb;
create table top100(
id int primary key auto_increment,
name varchar(50),
star varchar(150),
time varchar(50)
)charset=utf8;

Storing to a remote MySQL database

1. Enable remote connections

sudo -i
cd /etc/mysql/mysql.conf.d/
vi mysqld.cnf
Comment out the following line:
# bind-address = 127.0.0.1
service mysql restart

2. Add an authorized remote user

mysql> grant all privileges on *.* to 'Jent'@'%' identified by '123456' with grant option;
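Note: the GRANT ... IDENTIFIED BY form above is for MySQL 5.x; MySQL 8 no longer accepts it, so there the user has to be created first and then granted (a rough equivalent):

mysql> create user 'Jent'@'%' identified by '123456';
mysql> grant all privileges on *.* to 'Jent'@'%' with grant option;
mysql> flush privileges;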

3. Add an Ubuntu firewall rule

sudo ufw allow 3306
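
With those three steps done, the remote connection can be verified from the crawler machine; a quick sanity check with pymysql (the IP and account are the ones granted above):

import pymysql

db = pymysql.connect(host='192.168.56.131', user='Jent',
                     password='123456', database='mydb', charset='utf8')
cursor = db.cursor()
cursor.execute('select version()')
print(cursor.fetchone())   # prints the remote MySQL version if everything works
cursor.close()
db.close()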

Ubuntu firewall (ufw)

  • Enable the firewall
    sudo ufw enable
  • Disable the firewall
    sudo ufw disable
  • Add a firewall rule
    sudo ufw allow port_number
  • Check the status
    sudo ufw status
Example

import urllib.request
import re 
import pymysql
import time 
import warnings

class MaoyanSpider(object):
    def __init__(self):
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}
        self.page = 1
        # Create the database connection object
        self.db = pymysql.connect(
                    host='192.168.56.131',
                    user='lion', password='123456',
                    database='mydb',
                    charset='utf8')
        # Create the cursor object
        self.cursor = self.db.cursor()

    # Fetch a page
    def getPage(self,url):
        # Three steps: build the request, send it, read the response
        req = urllib.request.Request(url,
                   headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        self.parsePage(html)

    # Parse the page
    def parsePage(self,html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>',re.S)
        rList = p.findall(html)
        # rList:[('霸王别姬','张国荣','1993-'),(),()]
        self.writeMysql(rList)

    # Save the data
    def writeMysql(self,rList):
        # Suppress warnings
        warnings.filterwarnings('ignore')
        ins = 'insert into top100(name,star,\
               time) values(%s,%s,%s)'
        for rt in rList:
            L = [
                    rt[0].strip(),
                    rt[1].strip(),
                    rt[2].strip()[5:15]
                ]
            # execute() takes the parameters as a list
            self.cursor.execute(ins,L)
            self.db.commit()

    # Entry point
    def workOn(self):
        for pg in range(0,11,10):
            url = self.baseurl+str(pg)
            self.getPage(url)
            print('Page %d crawled successfully' % self.page)
            time.sleep(3)
            self.page += 1
        # Close the database only after all pages have been crawled
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.workOn()
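
A possible refinement, not in the original code: writeMysql commits once per record, while pymysql's executemany can insert a whole page of records in one call, e.g.:

    def writeMysql(self, rList):
        ins = 'insert into top100(name,star,time) values(%s,%s,%s)'
        data = [(rt[0].strip(), rt[1].strip(), rt[2].strip()[5:15])
                for rt in rList]
        # One batch insert and one commit per page
        self.cursor.executemany(ins, data)
        self.db.commit()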

The author's abilities are limited, so mistakes are hard to avoid.
If you spot an error, please don't hesitate to email the author with a correction; thanks in advance.
Email: JentChang@163.com (please mention the article title in your mail; including a link is even more convenient)
You can also leave your valuable feedback in the comment section below.

