python爬虫从0到1(六)

scrapy爬取企查查企业数据实战

在前几节内容中,已经介绍完了 scrapy 框架和应对反爬的机制。完成上述配置后,就可以开始相应内容的爬取了,这里我以企查查为例。

直接上代码:


# Business-registration (工商信息) spider.
class GSXXSpider(scrapy.Spider):
    """Crawl the basic registration-info table of each company whose
    detail URL is stored in the ``qcc_company_list`` MySQL table, and
    yield one :class:`GSXXItem` per company page.
    """
    name = 'gsxx_detail'
    allowed_domains = ['qcc.com']
    start_urls = ['https://www.qcc.com']

    def start_requests(self):
        """Page through qcc_company_list and request every detail page."""
        page = 0
        pagesize = 200
        conn = MysqlUtil()
        count = conn.get_count('select * from qcc_company_list')
        # Ceiling division so a partial last page is still fetched.
        # (The original round()/<= combination could miss or repeat pages.)
        total_pages = (count + pagesize - 1) // pagesize
        while page < total_pages:
            # BUGFIX: MySQL LIMIT takes (offset, row_count). The original
            # passed (page, pagesize * page + pagesize), which re-read
            # overlapping rows and skipped most of the table.
            result = conn.get_all(
                'select detail_url,id,company_name from qcc_company_list '
                'limit ' + str(page * pagesize) + ',' + str(pagesize))
            page += 1
            if result:
                for su in result:
                    # su = (detail_url, id, company_name)
                    yield scrapy.Request(
                        su[0], callback=self.parse,
                        meta={'company_id': su[1], 'company_name': su[2]})

    def parse(self, response):
        """Extract the registration-info table into a GSXXItem.

        NOTE(review): every row/column index below is hard-coded against
        the current qcc.com "Cominfo" table layout — re-verify whenever
        the site markup changes.
        """
        item = GSXXItem()
        tr_list = response.xpath('//*[@id="Cominfo"]/table[@class="ntable"]/tr')
        item['company_id'] = response.meta['company_id']
        item['company_name'] = response.meta['company_name']
        # Legal representative sits inside a nested layout, handled separately.
        item['fddbr'] = tr_list[0].xpath(
            'normalize-space(.//td[2]/div/div/div[2]//a/h2//text())').extract_first()
        # All remaining plain cells follow the same pattern: field -> (row, td column).
        plain_cells = {
            'djzt': (0, 4), 'clrq': (0, 6),
            'zczb': (1, 2), 'sjzb': (1, 4), 'hzrq': (1, 6),
            'xydm': (2, 2), 'jgdm': (2, 4), 'gszch': (2, 6),
            'nsrsbh': (3, 2), 'qydm': (3, 4), 'sshy': (3, 6),
            'qylx': (4, 2), 'yyqx': (4, 4), 'djjg': (4, 6),
            'rygm': (5, 2), 'cbrs': (5, 4), 'ssdq': (5, 6),
            'ywm': (6, 4), 'qydz': (7, 2), 'jyfw': (8, 2),
        }
        for field, (row, col) in plain_cells.items():
            item[field] = tr_list[row].xpath(
                'normalize-space(.//td[%d]//text())' % col).extract_first()
        # Former names (曾用名) may be several <span> entries; join them.
        # BUGFIX: the original tested ``cym != ''`` on a SelectorList,
        # which is always True; truthiness via iteration is correct.
        # ``or ''`` guards against extract_first() returning None, which
        # previously raised TypeError on concatenation.
        item['cym'] = ''.join(
            span.xpath('normalize-space(.//text())').extract_first() or ''
            for span in tr_list[6].xpath('.//td[2]/span'))

        yield item


# Shareholder (股东信息) spider.
class GDXXSpider(scrapy.Spider):
    """Crawl the shareholder table of each company listed in the
    ``qcc_company_list`` MySQL table, yielding one :class:`GDXXItem`
    per shareholder row.
    """
    name = 'gdxx_detail'
    allowed_domains = ['qcc.com']
    start_urls = []

    def start_requests(self):
        """Page through qcc_company_list and request every detail page."""
        page = 0
        pagesize = 200
        conn = MysqlUtil()
        count = conn.get_count('select * from qcc_company_list')
        # Ceiling division so a partial last page is still fetched.
        # (The original round()/<= combination could miss or repeat pages.)
        total_pages = (count + pagesize - 1) // pagesize
        while page < total_pages:
            # BUGFIX: MySQL LIMIT takes (offset, row_count). The original
            # passed (page, pagesize * page + pagesize), which re-read
            # overlapping rows and skipped most of the table.
            result = conn.get_all(
                'select detail_url,id,company_name from qcc_company_list '
                'limit ' + str(page * pagesize) + ',' + str(pagesize))
            page += 1
            if result:
                for su in result:
                    # su = (detail_url, id, company_name)
                    yield scrapy.Request(
                        su[0], callback=self.parse,
                        meta={'company_id': su[1]})

    def parse(self, response):
        """Yield one GDXXItem per shareholder row (header row skipped)."""
        tr_list = response.xpath(
            '//*[@id="partnern"]/div[@id="partnerslist"]/table/tr')
        for tr in tr_list[1:]:
            # BUGFIX: a fresh item per row — the original reused and
            # mutated a single item object across every yield.
            item = GDXXItem()
            item['company_id'] = response.meta['company_id']
            # Shareholder name.
            item['gdmc'] = tr.xpath(
                'normalize-space(.//td[2]/table/tr/td[2]/a/span//text())').extract_first()
            # NOTE(review): indexing [1] assumes td[3] matches at least two
            # nodes per row — confirm against the live page markup.
            bl = tr.xpath('.//td[3]')
            item['cgbl'] = bl[1].xpath('normalize-space(.//text())').extract_first()
            # Subscribed capital amount and date.
            item['rjcze'] = tr.xpath('normalize-space(.//td[4]//text())').extract_first()
            item['rjczrq'] = tr.xpath('normalize-space(.//td[5]//text())').extract_first()
            yield item


# Key-personnel (主要人员) spider.
class ZYRYSpider(scrapy.Spider):
    """Crawl the key-personnel table of each company listed in the
    ``qcc_company_list`` MySQL table, yielding one item per person row.
    """
    name = 'zyry_detail'
    allowed_domains = ['qcc.com']
    start_urls = []

    def start_requests(self):
        """Page through qcc_company_list and request every detail page."""
        page = 0
        pagesize = 200
        conn = MysqlUtil()
        count = conn.get_count('select * from qcc_company_list')
        # Ceiling division so a partial last page is still fetched.
        # (The original round()/<= combination could miss or repeat pages.)
        total_pages = (count + pagesize - 1) // pagesize
        while page < total_pages:
            # BUGFIX: MySQL LIMIT takes (offset, row_count). The original
            # passed (page, pagesize * page + pagesize), which re-read
            # overlapping rows and skipped most of the table.
            result = conn.get_all(
                'select detail_url,id,company_name from qcc_company_list '
                'limit ' + str(page * pagesize) + ',' + str(pagesize))
            page += 1
            if result:
                for su in result:
                    # su = (detail_url, id, company_name)
                    yield scrapy.Request(
                        su[0], callback=self.parse,
                        meta={'company_id': su[1]})

    def parse(self, response):
        """Yield one item per key-personnel row (header row skipped).

        NOTE(review): this spider fills a GDXXItem (the shareholder item)
        — presumably a dedicated ZYRY item class was intended; verify
        against the project's items module.
        """
        tr_list = response.xpath(
            '//*[@id="Mainmember"]/div[@id="employeeslist"]/table/tr')
        for tr in tr_list[1:]:
            # BUGFIX: a fresh item per row instead of mutating one shared
            # item object for every yield.
            item = GDXXItem()
            item['company_id'] = response.meta['company_id']
            # NOTE(review): all four fields below scrape the identical
            # XPath (td[2]) — almost certainly a copy-paste error in the
            # original; the correct column indices must be confirmed
            # against the live page before changing them.
            item['name'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            item['job'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            item['cgbl'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            item['sygf'] = tr.xpath('normalize-space(.//td[2]/a/span//text())').extract_first()
            # BUGFIX: the original built the item but never yielded it,
            # so the spider produced no output.
            yield item
python

关于作者

loyal
获得点赞
文章被阅读