python爬虫从0到1(六)

scrapy爬取企查查企业数据实战

在前几节内容中,已经介绍完了 scrapy 框架和应对反爬的机制。完成上述配置后,就可以开始相应内容的爬取了,这里我以企查查为例。

直接上代码:


# Business-registration (工商信息) spider.
class GSXXSpider(scrapy.Spider):
    """Crawl the basic registration-info table of each company whose
    detail URL is stored in the ``qcc_company_list`` MySQL table, and
    yield one :class:`GSXXItem` per company page.
    """
    name = 'gsxx_detail'
    allowed_domains = ['qcc.com']
    start_urls = ['https://www.qcc.com']

    def start_requests(self):
        """Page through qcc_company_list and request every detail page."""
        page = 0
        pagesize = 200
        conn = MysqlUtil()
        count = conn.get_count('select * from qcc_company_list')
        # Ceiling division so a partial last page is still fetched.
        # (The original round()/<= combination could miss or repeat pages.)
        total_pages = (count + pagesize - 1) // pagesize
        while page < total_pages:
            # BUGFIX: MySQL LIMIT takes (offset, row_count). The original
            # passed (page, pagesize * page + pagesize), which re-read
            # overlapping rows and skipped most of the table.
            result = conn.get_all(
                'select detail_url,id,company_name from qcc_company_list '
                'limit ' + str(page * pagesize) + ',' + str(pagesize))
            page += 1
            if result:
                for su in result:
                    # su = (detail_url, id, company_name)
                    yield scrapy.Request(
                        su[0], callback=self.parse,
                        meta={'company_id': su[1], 'company_name': su[2]})

    def parse(self, response):
        """Extract the registration-info table into a GSXXItem.

        NOTE(review): every row/column index below is hard-coded against
        the current qcc.com "Cominfo" table layout — re-verify whenever
        the site markup changes.
        """
        item = GSXXItem()
        tr_list = response.xpath('//*[@id="Cominfo"]/table[@class="ntable"]/tr')
        item['company_id'] = response.meta['company_id']
        item['company_name'] = response.meta['company_name']
        # Legal representative sits inside a nested layout, handled separately.
        item['fddbr'] = tr_list[0].xpath(
            'normalize-space(.//td[2]/div/div/div[2]//a/h2//text())').extract_first()
        # All remaining plain cells follow the same pattern: field -> (row, td column).
        plain_cells = {
            'djzt': (0, 4), 'clrq': (0, 6),
            'zczb': (1, 2), 'sjzb': (1, 4), 'hzrq': (1, 6),
            'xydm': (2, 2), 'jgdm': (2, 4), 'gszch': (2, 6),
            'nsrsbh': (3, 2), 'qydm': (3, 4), 'sshy': (3, 6),
            'qylx': (4, 2), 'yyqx': (4, 4), 'djjg': (4, 6),
            'rygm': (5, 2), 'cbrs': (5, 4), 'ssdq': (5, 6),
            'ywm': (6, 4), 'qydz': (7, 2), 'jyfw': (8, 2),
        }
        for field, (row, col) in plain_cells.items():
            item[field] = tr_list[row].xpath(
                'normalize-space(.//td[%d]//text())' % col).extract_first()
        # Former names (曾用名) may be several <span> entries; join them.
        # BUGFIX: the original tested ``cym != ''`` on a SelectorList,
        # which is always True; truthiness via iteration is correct.
        # ``or ''`` guards against extract_first() returning None, which
        # previously raised TypeError on concatenation.
        item['cym'] = ''.join(
            span.xpath('normalize-space(.//text())').extract_first() or ''
            for span in tr_list[6].xpath('.//td[2]/span'))

        yield item


# Shareholder (股东信息) spider.
class GDXXSpider(scrapy.Spider):
    """Crawl the shareholder table of each company listed in the
    ``qcc_company_list`` MySQL table, yielding one :class:`GDXXItem`
    per shareholder row.
    """
    name = 'gdxx_detail'
    allowed_domains = ['qcc.com']
    start_urls = []

    def start_requests(self):
        """Page through qcc_company_list and request every detail page."""
        page = 0
        pagesize = 200
        conn = MysqlUtil()
        count = conn.get_count('select * from qcc_company_list')
        # Ceiling division so a partial last page is still fetched.
        # (The original round()/<= combination could miss or repeat pages.)
        total_pages = (count + pagesize - 1) // pagesize
        while page < total_pages:
            # BUGFIX: MySQL LIMIT takes (offset, row_count). The original
            # passed (page, pagesize * page + pagesize), which re-read
            # overlapping rows and skipped most of the table.
            result = conn.get_all(
                'select detail_url,id,company_name from qcc_company_list '
                'limit ' + str(page * pagesize) + ',' + str(pagesize))
            page += 1
            if result:
                for su in result:
                    # su = (detail_url, id, company_name)
                    yield scrapy.Request(
                        su[0], callback=self.parse,
                        meta={'company_id': su[1]})

    def parse(self, response):
        """Yield one GDXXItem per shareholder row (header row skipped)."""
        tr_list = response.xpath(
            '//*[@id="partnern"]/div[@id="partnerslist"]/table/tr')
        for tr in tr_list[1:]:
            # BUGFIX: a fresh item per row — the original reused and
            # mutated a single item object across every yield.
            item = GDXXItem()
            item['company_id'] = response.meta['company_id']
            # Shareholder name.
            item['gdmc'] = tr.xpath(
                'normalize-space(.//td[2]/table/tr/td[2]/a/span//text())').extract_first()
            # NOTE(review): indexing [1] assumes td[3] matches at least two
            # nodes per row — confirm against the live page markup.
            bl = tr.xpath('.//td[3]')
            item['cgbl'] = bl[1].xpath('normalize-space(.//text())').extract_first()
            # Subscribed capital amount and date.
            item['rjcze'] = tr.xpath('normalize-space(.//td[4]//text())').extract_first()
            item['rjczrq'] = tr.xpath('normalize-space(.//td[5]//text())').extract_first()
            yield item


# Key-personnel (主要人员) spider.
class ZYRYSpider(scrapy.Spider):
    """Crawl the key-personnel table of each company listed in the
    ``qcc_company_list`` MySQL table, yielding one item per person row.
    """
    name = 'zyry_detail'
    allowed_domains = ['qcc.com']
    start_urls = []

    def start_requests(self):
        """Page through qcc_company_list and request every detail page."""
        page = 0
        pagesize = 200
        conn = MysqlUtil()
        count = conn.get_count('select * from qcc_company_list')
        # Ceiling division so a partial last page is still fetched.
        # (The original round()/<= combination could miss or repeat pages.)
        total_pages = (count + pagesize - 1) // pagesize
        while page < total_pages:
            # BUGFIX: MySQL LIMIT takes (offset, row_count). The original
            # passed (page, pagesize * page + pagesize), which re-read
            # overlapping rows and skipped most of the table.
            result = conn.get_all(
                'select detail_url,id,company_name from qcc_company_list '
                'limit ' + str(page * pagesize) + ',' + str(pagesize))
            page += 1
            if result:
                for su in result:
                    # su = (detail_url, id, company_name)
                    yield scrapy.Request(
                        su[0], callback=self.parse,
                        meta={'company_id': su[1]})

    def parse(self, response):
        """Yield one item per key-personnel row (header row skipped).

        NOTE(review): this spider fills a GDXXItem (the shareholder item)
        — presumably a dedicated ZYRY item class was intended; verify
        against the project's items module.
        """
        tr_list = response.xpath(
            '//*[@id="Mainmember"]/div[@id="employeeslist"]/table/tr')
        for tr in tr_list[1:]:
            # BUGFIX: a fresh item per row instead of mutating one shared
            # item object for every yield.
            item = GDXXItem()
            item['company_id'] = response.meta['company_id']
            # NOTE(review): all four fields below scrape the identical
            # XPath (td[2]) — almost certainly a copy-paste error in the
            # original; the correct column indices must be confirmed
            # against the live page before changing them.
            item['name'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            item['job'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            item['cgbl'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            item['sygf'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            # BUGFIX: the original built the item but never yielded it,
            # so the spider produced no output.
            yield item
python

关于作者

loyal
获得点赞
文章被阅读