本文共 1844 字,大约阅读时间需要 6 分钟。
闲来无事,想找点段子乐呵一下,就逛到了糗事百科。这次爬取没有什么难度,唯一值得一提的是增加了一点点代码健壮性。
import requests
from lxml import etree


class Spider:
    """Crawl the text-joke listing pages of qiushibaike.com and save them.

    ``run`` fetches listing pages 1-12, extracts the author, the joke text
    and the "funny" count of every entry, and writes the collected values
    to ``data.txt`` as the ``str()`` of one flat list.
    """

    def __get_page(self, url, headers):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            headers: HTTP headers sent with the request.

        Returns:
            The response text when the status code is 200, otherwise
            ``None`` (also ``None`` when the request itself fails).
        """
        try:
            # The timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def __parse_page(self, html):
        """Extract author, content and funny-count from one listing page.

        Args:
            html: Page source as returned by ``__get_page``; may be ``None``
                or empty when the download failed.

        Returns:
            A flat list holding three consecutive strings per entry
            (author, content, funny count). Empty when ``html`` is empty
            or contains no entries.
        """
        results = []
        if not html:
            # Nothing was downloaded, so there is nothing to parse.
            return results

        data = etree.HTML(html)
        if data is None:
            # Guard against a parse result that has no root element.
            return results

        for item in data.xpath('//div[@id="content-left"]/div'):
            # Author name; fall back to a placeholder when it is missing.
            author = item.xpath('./div[1]/a[2]/h2/text()')
            results.append(author[0].strip() if author else '匿名用户')

            # Joke text; the text nodes are joined and newlines removed.
            content = item.xpath('./a[1]/div/span/text()')
            if content:
                results.append(''.join(content).replace('\n', ''))
            else:
                results.append('此用户没有内容')

            # Funny count; defaults to '0' when the counter is missing.
            number = item.xpath('./div[2]/span[1]/i/text()')
            results.append(number[0] if number else '0')
        return results

    def __save_to_txt(self, data):
        """Write ``data`` to ``data.txt``, replacing any previous content."""
        with open('data.txt', 'w', encoding='utf-8') as f:
            f.write(data)

    def run(self):
        """Crawl pages 1-12 and store every extracted value in ``data.txt``.

        Pages that cannot be downloaded are skipped. All results are
        collected first and written in a single call, because
        ``__save_to_txt`` truncates the file on every write.
        """
        # The headers are identical for every request, so build them once.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                          ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        all_results = []
        for i in range(1, 13):
            url = 'https://www.qiushibaike.com/text/page/' + str(i)
            # Inside the class body the double-underscore names are mangled
            # consistently, so the helpers must be called with their full
            # ``__name`` spelling.
            html = self.__get_page(url, headers)
            if html is None:
                continue
            all_results.extend(self.__parse_page(html))
        self.__save_to_txt(str(all_results))


# Instantiate the spider and start the crawl.
spider = Spider()
spider.run()
转载地址:http://sbell.baihongyu.com/