Scraping Douyu Data with Selenium


Scrape Douyu livestream data with Python and Selenium (requires PhantomJS or Chromedriver to be installed).
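Note: PhantomJS support has been removed from recent Selenium releases, so Chromedriver is the more future-proof choice. The script below also needs the retrying package (pip install selenium retrying). If you want the same windowless behavior PhantomJS provided, a minimal sketch of running Chrome headless (assuming Chrome and a matching Chromedriver are installed) looks like this:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # run Chrome without opening a browser window
driver = webdriver.Chrome(options=options)
driver.get('https://www.douyu.com/directory/all')
print(driver.title)  # quick sanity check that the page loaded
driver.quit()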


#!/usr/bin/env python
# encoding: utf-8
"""
@file: xc_04_斗鱼爬虫.py
@author: www.xcooo.cn
@Mail: 602006050@qq.com
"""
from selenium import webdriver
from retrying import retry
import time

class DouYu():
    def __init__(self):
        self.start_url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()

    @retry(stop_max_attempt_number=5)  # retry extraction up to 5 times if the page is not ready
    def get_content_list(self):
        li_list = self.driver.find_elements_by_xpath("//ul[@class='layout-Cover-list']/li")
        content_list = []
        for li in li_list:
            item = {}
            item['url'] = li.find_element_by_xpath(".//a[@class='DyListCover-wrap']").get_attribute('href')
            item['title'] = li.find_element_by_xpath(".//h3[@class='DyListCover-intro']").get_attribute('title')
            item['categories'] = li.find_element_by_xpath(".//span[@class='DyListCover-zone']").text
            item['watch_num'] = li.find_element_by_xpath(".//span[@class='DyListCover-hot']").text
            item['anchor'] = li.find_element_by_xpath(".//h2[@class='DyListCover-user']").text
            print(item)
            print('\n')

            # append one tab-separated record per room
            with open('douyu.txt', 'a', encoding='utf-8') as f:
                f.write('\t'.join([item['url'], item['title'],
                                   item['categories'], item['watch_num'],
                                   item['anchor']]) + '\n')

        # locate the "next page" element; None when on the last page
        next_url = self.driver.find_elements_by_xpath("//span[@class='dy-Pagination-item-custom']")
        next_url = next_url[0] if len(next_url) > 0 else None

        return next_url


    def run(self):
        # 1. prepare the start URL
        # 2. send the request and get the response
        self.driver.get(self.start_url)

        # 3. extract the data
        next_url = self.get_content_list()

        # 4. saving is handled inside get_content_list

        # 5. paginate: keep clicking "next page" and extracting
        while next_url is not None:
            next_url.click()  # clicking before the page fully loads raises an error; @retry re-attempts the extraction
            time.sleep(3)     # crude wait for the next page to render (see the explicit-wait sketch after the listing)
            next_url = self.get_content_list()

        self.driver.quit()  # close the browser once pagination is exhausted

if __name__ == '__main__':
    douyu = DouYu()
    douyu.run()
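The fixed time.sleep(3) after each click is fragile: too short and the extraction fails, too long and the crawl slows down. A sketch of an explicit wait instead (the XPath is taken from the script above; the 10-second timeout is an arbitrary choice, not part of the original code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block until at least one room card is present, or raise TimeoutException after 10s
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, "//ul[@class='layout-Cover-list']/li"))
)

Note also that Selenium 4 removed the find_element(s)_by_xpath helpers used above; on current releases, replace them with driver.find_element(By.XPATH, ...) and driver.find_elements(By.XPATH, ...).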

