MongoDB
pymongo
- 1. Connection object
conn = pymongo.MongoClient('IP', 27017)
- 2. Database object
db = conn['db_name']
- 3. Collection object
myset = db['collection_name']
- 4. Insert a document
myset.insert_one({...})
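A minimal runnable sketch of the four steps above (the host, database, and collection names here are stand-ins; adjust them for your setup):

import pymongo

# 1. connection object (assumes MongoDB is listening on the default port)
conn = pymongo.MongoClient('localhost', 27017)
# 2. database object
db = conn['mydb']
# 3. collection object
myset = db['top100']
# 4. insert one document
myset.insert_one({'name': '霸王别姬', 'time': '1993-01-01'})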
Common mongo shell commands:
>>>show dbs
>>>use db_name
>>>show collections
>>>db.collection_name.find().pretty()
>>>db.collection_name.count()
>>>db.dropDatabase()
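The same checks can be run from Python; a sketch of the PyMongo equivalents (note that count_documents() is the modern replacement for the deprecated count() used above):

import pymongo

conn = pymongo.MongoClient('localhost', 27017)
db = conn['mydb']
print(conn.list_database_names())          # show dbs
print(db.list_collection_names())          # show collections
for doc in db['top100'].find():            # db.top100.find()
    print(doc)
print(db['top100'].count_documents({}))    # db.top100.count()
conn.drop_database('mydb')                 # db.dropDatabase()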
Example
import urllib.request
import re
import pymongo
import time

class MaoyanSpider(object):
    def __init__(self):
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}
        self.page = 1
        # connection object
        self.conn = pymongo.MongoClient('192.168.56.131', 27017)
        # database object
        self.db = self.conn['mydb']
        # collection object
        self.myset = self.db['top100']

    # fetch a page
    def getPage(self, url):
        # the standard three steps: build the request, send it, read and decode
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        self.parsePage(html)

    # parse a page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>', re.S)
        rList = p.findall(html)
        # rList: [('霸王别姬', '张国荣', '1993-'), (), ()]
        self.writeMongo(rList)

    # save the data
    def writeMongo(self, rList):
        for rt in rList:
            d = {
                "name": rt[0].strip(),
                "star": rt[1].strip(),
                "time": rt[2].strip()
            }
            # insert the document into MongoDB
            self.myset.insert_one(d)

    # entry point
    def workOn(self):
        for pg in range(0, 21, 10):
            url = self.baseurl + str(pg)
            self.getPage(url)
            print('Page %d scraped successfully' % self.page)
            time.sleep(3)
            self.page += 1

if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.workOn()
MySQL
1. Create a database connection object
2. Create a cursor object
3. Execute the SQL statement
4. (Important) Commit the changes to the database
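A minimal sketch of these four steps with pymysql (host, user, and password are taken from the examples below; adjust for your setup):

import pymysql

# 1. create the connection object
db = pymysql.connect(host='192.168.56.131', user='lion',
                     password='123456', database='mydb',
                     charset='utf8')
# 2. create the cursor object
cursor = db.cursor()
# 3. execute a statement (parameterized, so values are escaped safely)
cursor.execute('insert into top100(name,star,time) values(%s,%s,%s)',
               ['霸王别姬', '张国荣', '1993-01-01'])
# 4. commit, or the insert is never persisted
db.commit()
cursor.close()
db.close()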
Example
1. Create the database:
mysql> create database mydb charset=utf8;
2. Create the table:
mysql> use mydb;
mysql> create table top100(
id int primary key auto_increment,
name varchar(50),
star varchar(150),
time varchar(50)
)charset=utf8;
Writing to a remote MySQL database
1. Enable remote connections
sudo -i
cd /etc/mysql/mysql.conf.d/
vi mysqld.cnf
Comment out the following line:
# bind-address = 127.0.0.1
service mysql restart
2. Add an authorized user
mysql> grant all privileges on *.* to 'Jent'@'%' identified by '123456' with grant option;
3. Add a Ubuntu firewall rule
sudo ufw allow 3306
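With the three steps above in place, a quick connectivity check from the client machine (using the user granted above; the server IP matches the spider examples):

import pymysql

# connect to the remote MySQL server
db = pymysql.connect(host='192.168.56.131', user='Jent',
                     password='123456', database='mydb',
                     charset='utf8')
cursor = db.cursor()
cursor.execute('select version()')
print(cursor.fetchone())   # e.g. ('5.7.29',) if the connection works
cursor.close()
db.close()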
The Ubuntu firewall (ufw)
- Enable the firewall
sudo ufw enable
- Disable the firewall
sudo ufw disable
- Add a firewall rule
sudo ufw allow port_number
- Check the status
sudo ufw status
import urllib.request
import re
import pymysql
import time
import warnings

class MaoyanSpider(object):
    def __init__(self):
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}
        self.page = 1
        # create the connection object
        self.db = pymysql.connect(host='192.168.56.131',
                                  user='lion', password='123456',
                                  database='mydb', charset='utf8')
        # create the cursor object
        self.cursor = self.db.cursor()

    # fetch a page
    def getPage(self, url):
        # the standard three steps: build the request, send it, read and decode
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        self.parsePage(html)

    # parse a page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>', re.S)
        rList = p.findall(html)
        # rList: [('霸王别姬', '张国荣', '1993-'), (), ()]
        self.writeMysql(rList)

    # save the data
    def writeMysql(self, rList):
        # suppress MySQL warnings
        warnings.filterwarnings('ignore')
        ins = 'insert into top100(name,star,time) values(%s,%s,%s)'
        for rt in rList:
            L = [
                rt[0].strip(),
                rt[1].strip(),
                rt[2].strip()[5:15]  # keep only the yyyy-mm-dd part
            ]
            # execute takes the parameters as a list (or tuple)
            self.cursor.execute(ins, L)
            self.db.commit()

    # entry point
    def workOn(self):
        for pg in range(0, 11, 10):
            url = self.baseurl + str(pg)
            self.getPage(url)
            print('Page %d scraped successfully' % self.page)
            time.sleep(3)
            self.page += 1
        # close the cursor and connection only after all pages are scraped
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.workOn()
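One possible refinement, not in the original code: pymysql cursors also support executemany(), which inserts all rows from a page in a single call with one commit. Inside writeMysql it would replace the loop like this:

# batch insert: rows is the list of (name, star, time) tuples
# built from rList exactly as the loop above builds L
ins = 'insert into top100(name,star,time) values(%s,%s,%s)'
rows = [(rt[0].strip(), rt[1].strip(), rt[2].strip()[5:15]) for rt in rList]
self.cursor.executemany(ins, rows)
self.db.commit()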
My abilities are limited and mistakes are hard to avoid.
If you spot an error, please don't hesitate to email me a correction; thanks in advance.
Email: JentChang@163.com (please mention the article title in your message; including a link is even better)
You can also leave your valuable feedback in the comment section below.