Python Crawler from 0 to 1 (Part 5)

Configuring dynamic cookies, a dynamic User-Agent, and IP proxies in Scrapy

 

When crawling a site at scale, the target site will apply anti-crawling measures such as banning IPs and banning accounts.

So during the crawl you need to keep rotating IPs, login accounts (cookies), and so on.

Main middleware configuration (settings.py):
DOWNLOADER_MIDDLEWARES = {
   'qcc.middlewares.RadomUserAgentMiddleware': 543,
   'qcc.middlewares.RandomProxyMiddleware':544,
}
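Both custom middlewares below read their pools from the project settings: RandomProxyMiddleware reads PROXIES and RadomUserAgentMiddleware reads USER_AGENTS_LIST. Those two lists are not shown in the original post, so the following settings.py snippet is only a sketch with placeholder values:

# settings.py (sketch; the proxy addresses and UA strings are placeholders)
PROXIES = [
    'http://111.111.111.111:8118',
    'http://112.112.112.112:8080',
]

USER_AGENTS_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]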
IP proxy middleware code (middlewares.py):

# middlewares.py
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# Downloader middleware that picks a random IP proxy
class RandomProxyMiddleware(object):
    # Pick a random proxy from the PROXIES list in settings and attach it to the request
    def process_request(self, request, spider):
        proxy = random.choice(spider.settings['PROXIES'])
        print(proxy)
        request.meta['proxy'] = proxy
# Middleware that swaps in a random User-Agent and a cookie (the cookies were collected in the previous post)
class RadomUserAgentMiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        # Pick a random User-Agent from USER_AGENTS_LIST in settings
        user_agent_list = spider.settings['USER_AGENTS_LIST']
        agent = random.choice(user_agent_list)
        request.headers["User-Agent"] = agent

        # Example qcc.com (Qichacha) cookies:
        # cookies = {
        #     'acw_tc': 'b4a37a2816167369376712952e3ecc7912491c0db7f9a60b9585b0b56b',
        #     'CNZZDATA1254842228': '472977143-1615263367-%7C1616563623',
        #     'hasShow': '1',
        #     'QCCSESSID': 'vu3n0932lfjrn52mqtolv3go12',
        #     'UM_distinctid': '17863b5fab11a0-0dadf61d248ed2-3f616e4b-fa000-17863b5fab290f',
        #     'zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f': '%7B%22sid%22%3A%201616736937563%2C%22updated%22%3A%201616738533732%2C%22info%22%3A%201616580573818%2C%22superProperty%22%3A%20%22%7B%5C%22%E5%BA%94%E7%94%A8%E5%90%8D%E7%A7%B0%5C%22%3A%20%5C%22%E4%BC%81%E6%9F%A5%E6%9F%A5%E7%BD%91%E7%AB%99%5C%22%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qcc.com%22%2C%22cuid%22%3A%20%22260fbef807fb499d4275ce8b860b721a%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%7D',
        #     'zg_did': '%7B%22did%22%3A%20%221781569a5bf532-09d0c4d076a24b8-3f616e4b-fa000-1781569a5c0b99%22%7D'
        # }
        # Pull a random cookie from the Redis cookie pool
        ck = RedisUtil().getRandomCookie()
        if len(ck) != 0:
            request.cookies = eval(ck)
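RedisUtil().getRandomCookie() is the cookie-pool helper from the previous post; its implementation is not repeated here. As a rough sketch only, assuming the cookies are stored as dict strings in a Redis set (the class layout, the key name 'qcc:cookies' and the connection details are all assumptions), it could look like this with redis-py:

import redis


class RedisUtil(object):
    # Hypothetical sketch of the cookie-pool helper; key name and storage format are assumptions
    def __init__(self, host='127.0.0.1', port=6379, db=0):
        self.client = redis.Redis(host=host, port=port, db=db, decode_responses=True)

    def addCookie(self, cookie_dict):
        # Store the cookie dict as a string in a Redis set
        self.client.sadd('qcc:cookies', str(cookie_dict))

    def getRandomCookie(self):
        # Return a random cookie string from the set, or '' when the pool is empty
        ck = self.client.srandmember('qcc:cookies')
        return ck if ck else ''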

 

If budget is not an issue you can simply buy IP proxies. I use free ones, but free proxies expire quickly and many of the IPs do not work at all, so they need to be checked before use:

import requests
import time
import random
from lxml import etree
from queue import Queue
from threading import Thread


# Check whether each proxy in the queue actually works
def get_ip():
    while True:
        if not q.empty():
            # URL used to verify that the proxy is usable
            url = 'http://httpbin.org/get'
            proxies = q.get()
            try:
                requests.get(url, headers=headers, proxies=proxies, timeout=10)
                print('proxy works')
                with open('ip.txt', 'a') as f:
                    f.write(str(proxies))
                    f.write('\n')
            except Exception:
                print('proxy unusable, trying the next one')
        else:
            break


def main():
    t_list = []
    for i in range(5):
        t = Thread(target=get_ip)
        t_list.append(t)
        t.start()

    for t in t_list:
        t.join()


q = Queue()
# Crawl pages 300 to 1000 of the free proxy list; proxies further down the list are hopefully used by fewer people... wishful thinking
for i in range(300, 1000):
    url = 'https://www.kuaidaili.com/free/inha/{}'.format(i)
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    # proxies = {'http': 'http://211.159.219.225:8118', 'https': 'https://211.159.219.225:8118'}
    # html = requests.get(url, headers=headers,proxies=proxies).text
    html = requests.get(url, headers=headers).text
    # print(html)
    parse_html = etree.HTML(html)
    tr_list = parse_html.xpath('//*[@id="list"]/table/tbody/tr')
    # Wait a random 6 to 11 seconds between pages
    sleep = random.randint(6, 11)
    print(f'waiting {sleep} seconds')
    time.sleep(sleep)
    print('start parsing')
    for tr in tr_list[1:]:
        ip = tr.xpath('./td[1]/text()')[0]
        port = tr.xpath('./td[2]/text()')[0]
        proxies = {
            'http': f'http://{ip}:{port}',
            'https': f'https://{ip}:{port}',
        }
        print(proxies)
        # Put the proxy dict into the queue for checking
        q.put(proxies)
    main()

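The checker appends every working proxy to ip.txt as one dict string per line. To feed them into RandomProxyMiddleware they still have to become the PROXIES list in settings.py; one possible way to load the file (the file name and line format follow the checker above, everything else is an assumption):

# Load ip.txt into a plain list of proxy URLs for settings.PROXIES (sketch)
import ast

PROXIES = []
with open('ip.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Each line looks like {'http': 'http://1.2.3.4:8080', 'https': 'https://1.2.3.4:8080'}
        proxy_dict = ast.literal_eval(line)
        PROXIES.append(proxy_dict['http'])

print(PROXIES)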