Scrapy 版:把前面用 python 抓取图集谷(tujigu)女孩图片的脚本升级为 Scrapy 项目,这次抓取某位女孩的全部写真集;有能力的可以自己改写成抓取全站,使用前请把地址替换成自己喜欢的女孩的图集地址。
抓取,全部,女孩,自己,Scrapy,升级,前面,python,图集,图片,这次,写真集,能力,改写,替换,喜欢,地址
2025-04-01 16:27:57 时间
大家好,又见面了,我是你们的朋友全栈君。
首先创建 Scrapy 项目 ImagesRename(命令:scrapy startproject ImagesRename)
在spiders 里面创建 ImgRename.py 输入代码
import scrapy
from ImagesRename.items import ImagesrenameItem
class ImgsrenameSpider(scrapy.Spider):
    """Spider that walks a tujigu.com gallery and yields one item per page."""

    name = 'tujigu'
    # Replace this with the gallery URL of the girl you want to crawl.
    start_urls = ['https://www.tujigu.com/a/28177/']

    def parse(self, response):
        # One item per gallery page; imgurl collects every src attribute
        # inside the content block, i.e. all images on the page.
        gallery = ImagesrenameItem()
        gallery['imgurl'] = response.xpath("//div[@class='content']//@src").extract()
        # The page's <h1> (inside the breadcrumb div) doubles as the album name.
        gallery['imgname'] = response.xpath("//div[@class='weizhi']//h1").extract_first()
        yield gallery

        # The 11th anchor in the pager is the "next page" link; keep
        # following it until the gallery runs out of pages.
        next_page = response.xpath('//*[@id="pages"]//a[11]//@href').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
在items里面替换
import scrapy
class ImagesrenameItem(scrapy.Item):
    """Container for one scraped gallery page.

    Fields:
        imgurl  -- list of image URLs found on the page
        imgname -- raw <h1> markup, later cleaned into a folder name
    """
    imgurl = scrapy.Field()
    imgname = scrapy.Field()
在middlewares 里面添加
class NovelUserAgentMiddleWare(object):
    """Downloader middleware that attaches a random User-Agent per request.

    Fix: the original list was missing a comma after the first entry, so
    implicit string concatenation silently merged the first two UA strings
    into one malformed entry (14 entries instead of 15).
    """

    def __init__(self):
        # Pool of desktop Chrome User-Agent strings to rotate through.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]

    def process_request(self, request, spider):
        """Pick a random UA and set it on the outgoing request."""
        import random
        ua = random.choice(self.user_agent_list)
        print('User-Agent:' + ua)
        # setdefault: do not overwrite a UA that was set explicitly elsewhere.
        request.headers.setdefault('User-Agent', ua)
class NovelProxyMiddleWare(object):
    """Downloader middleware that routes each request through a random proxy.

    Fix: the original split the proxy file on '\n' without filtering, so a
    trailing newline in a.txt could make random.choice return an empty
    string and produce the broken proxy URL "http://".
    """

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy to the outgoing request."""
        proxy = self.get_random_proxy()
        print("Request proxy is {}".format(proxy))
        request.meta["proxy"] = "http://" + proxy

    def get_random_proxy(self):
        """Return one 'host:port' line chosen at random from a.txt.

        Blank lines and surrounding whitespace are stripped, so a trailing
        newline in the file can never yield an empty proxy.
        """
        import random
        with open('a.txt', 'r', encoding="utf-8") as f:  # proxy list, one per line
            proxies = [line.strip() for line in f if line.strip()]
        return random.choice(proxies)
在pipelines里面替换
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ImagesrenamePipeline:
    """Default pass-through pipeline scaffolded by Scrapy.

    NOTE(review): a class of the same name is re-defined later in this
    module and shadows this one; this stub is effectively unused.
    """

    def process_item(self, item, spider):
        # No transformation — hand the item straight back to the engine.
        return item
import re
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
class ImagesrenamePipeline(ImagesPipeline):
    """Image pipeline that downloads every URL in item['imgurl'] and stores
    each file under a sub-folder named after the cleaned album title.

    Without overriding file_path(), Scrapy names files by their SHA1 hash.
    """

    def get_media_requests(self, item, info):
        # One download request per image URL. The album name rides along in
        # request.meta so file_path() can use it for the folder name.
        for image_url in item['imgurl']:
            yield Request(image_url, meta={'name': item['imgname']})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return '<album>/<file>' relative to IMAGES_STORE.

        The keyword-only `item` parameter keeps this override compatible
        with Scrapy >= 2.4, which passes the item to file_path().
        """
        # Last URL segment (e.g. '12.jpg') becomes the file name.
        image_guid = request.url.split('/')[-1]
        # Album name forwarded from get_media_requests() via meta.
        name = request.meta['name']
        # Drop markup and noise characters (<, >, /, h, 1, digits, 第, 页,
        # N, O, '.', space) so the remainder is a valid Windows folder name;
        # without this step the folder name contains markup/garbage.
        name = ''.join(re.findall(r'[^<>/h1第0-9页NO. ]', name))
        # {0} is the album folder, {1} the file inside it.
        return u'{0}/{1}'.format(name, image_guid)
最后settings里添加
# Scrapy project identity.
BOT_NAME = 'ImagesRename'
SPIDER_MODULES = ['ImagesRename.spiders']
NEWSPIDER_MODULE = 'ImagesRename.spiders'

# The site throttles aggressively: retry failed requests up to 20 times
# with a short 3-second download timeout, including on 429/404/403.
RETRY_ENABLED = True
RETRY_TIMES = 20
DOWNLOAD_TIMEOUT = 3
RETRY_HTTP_CODES = [429, 404, 403]

# Enable the renaming image pipeline.
ITEM_PIPELINES = {
    'ImagesRename.pipelines.ImagesrenamePipeline': 300,
}

# Random User-Agent / random proxy middlewares; if you did not add them
# to middlewares.py, remove these two entries.
DOWNLOADER_MIDDLEWARES = {
    'ImagesRename.middlewares.NovelUserAgentMiddleWare': 544,  # random UA
    'ImagesRename.middlewares.NovelProxyMiddleWare': 543,  # random proxy
}

# Directory where downloaded images are stored. Raw string so the
# backslashes in the Windows path are not parsed as escape sequences
# (the original non-raw literal triggers SyntaxWarning on Python 3.12+).
IMAGES_STORE = r'D:\学习\pythonProject\scrapy\ImagesRename'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ImagesRename (+http://www.yourdomain.com)'

# Ignore robots.txt (required to crawl this site).
ROBOTSTXT_OBEY = False
发布者:全栈程序员栈长,转载请注明出处:https://javaforall.cn/154228.html原文链接:https://javaforall.cn
相关文章
- 240个Python练习案例附源码(百看不如一练)
- Python进阶41-drf框架(三)
- 每天五分钟学Python,数字和字符串的基本用法
- 工具推荐|利用python-cdo高效处理气象数据
- python实现卷积操作
- Python实战 | 送亲戚,送长辈,“ 月饼 ”可视化大屏来帮忙。
- 王力宏的瓜很大!我用Python爬取了瓜文评论区,发现更精彩
- 你一定是在逗我,Python都不会就想做算法?
- CPD 算法实现点云配准(python版本)[通俗易懂]
- Python 哈希表查询_进入<哈希函数>为结界的世界
- 用Python画一棵带音乐的雪夜圣诞树
- 【敬初学者】Python基础学完了,该怎么知道自己学的怎么样呢?十个经典实战小项目附源码
- Python实现门禁管理系统
- python:最大公约数和最小公倍数
- python中的变量命名规则
- python线性回归算法「建议收藏」
- [Python图像处理] 十一.灰度直方图概念及OpenCV绘制直方图
- python测试框架unittest如何设置用例优先级_python 的 unittest 测试框架中的测试依赖怎么解决呢…[通俗易懂]
- 用Python做图像处理[通俗易懂]
- 用python实现线性回归算法