整个DBPipeline类
# -*- coding: utf-8 -*-
import pymysql
import logging
import json
import time
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# implicit str/unicode conversions use UTF-8. Python 3 has no reload()
# builtin and already defaults to UTF-8, so the hack is skipped there
# instead of raising NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Load the configuration module that lives in the current folder.
import config
# MySQL connection settings; DBPipeline reads the MYSQL_HOST, MYSQL_PORT,
# MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD keys from this mapping.
mysql_config = config.mysql_config
# Item pipeline used for database storage.
class DBPipeline(object):
    """Store scraped items in the ``mymovie`` MySQL table.

    The connection is opened from ``mysql_config`` when the pipeline is
    constructed. Items are de-duplicated on ``img_url`` before insertion.
    """

    def __init__(self):
        """Open the MySQL connection and create the cursor used for all SQL."""
        self.connect = pymysql.connect(
            host=mysql_config['MYSQL_HOST'],
            port=mysql_config['MYSQL_PORT'],
            db=mysql_config['MYSQL_DBNAME'],
            user=mysql_config['MYSQL_USER'],
            passwd=mysql_config['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True
        )
        # Every insert/select/update/delete goes through this cursor.
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``mymovie`` unless its ``img_url`` already exists.

        Args:
            item: Mapping with the keys ``name``, ``info``, ``rating``,
                ``num``, ``quote`` and ``img_url``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, whether or not it was stored.

        Any exception raised while talking to the database is logged and the
        open transaction is rolled back; it is not re-raised.
        """
        try:
            # Duplicate check: look for a row with the same image URL.
            self.cursor.execute(
                """select * from mymovie where img_url = %s""",
                (item['img_url'],))
            repetition = self.cursor.fetchone()
            if not repetition:
                # No existing row: insert the new record and commit it.
                self.cursor.execute(
                    """insert into mymovie (name, info, rating, num ,quote, img_url)
                    value (%s, %s, %s, %s, %s, %s)""",
                    (item['name'],
                     item['info'],
                     item['rating'],
                     item['num'],
                     item['quote'],
                     item['img_url']))
                self.connect.commit()
        except Exception:
            # The original called an undefined ``log`` here, which turned every
            # database error into a NameError; use the module logger instead.
            logger.exception("Failed to store item in mymovie")
            try:
                # Discard the half-finished transaction so the next item
                # starts from a clean state.
                self.connect.rollback()
            except Exception:
                logger.exception("Rollback failed after storage error")
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            self.connect.close()
最后在settings文件中注册DBPipeline:
# Enable DBPipeline by its dotted path; the integer is the pipeline's order
# value in Scrapy's ITEM_PIPELINES setting.
ITEM_PIPELINES = {'movie.MoviePipelines.DBPipeline': 10}