Scraping URL Addresses from Guokr


Extracting URL addresses from Guokr's highlighted-questions pages with a Python crawler: the script below builds a list of page URLs, requests each page, pulls out the question links and titles with a regular expression, and appends them to guokr.txt.


#!/usr/bin/env python
# encoding: utf-8
"""
@file: xc_03_果壳网url提取.py
@author: www.xcooo.cn
@Mail: 602006050@qq.com
"""
import requests
import re

class GuoKr:
    def __init__(self):
        self.url_temp = "https://www.guokr.com/ask/highlight/?page={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"}

    # Build the list of page URLs (pages 1 to 100)
    def get_url_list(self):
        return [self.url_temp.format(i) for i in range(1, 101)]

    # Send the request and get the response body as text
    def parse_url(self, url):
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    # Extract the question URLs and titles with a regex
    def re_html_str(self, html_str):
        # Each match is a (url, title) tuple; re.S lets . match newlines inside the title
        content_list = re.findall(
            r'<h2><a target="_blank" href="(https://www\.guokr\.com/question/\d+/)">(.*?)</a></h2>',
            html_str, re.S)
        return content_list

    # Append the (url, title) pairs to guokr.txt, one tab-separated pair per line
    def save_content(self, content_list):
        with open('guokr.txt', 'a', encoding='utf-8') as file:
            for data in content_list:
                print(data)
                file.write('{}\t{}\n'.format(data[0], data[1]))
        print('Saved successfully')

    def run(self):
        # 1. Build the URL list
        url_list = self.get_url_list()

        # 2. Request each page and get the response
        for url in url_list:
            html_str = self.parse_url(url)

            # 3. Extract the (url, title) pairs
            content_list = self.re_html_str(html_str)

            # 4. Save them to guokr.txt
            self.save_content(content_list)


if __name__ == '__main__':
    guokr = GuoKr()
    guokr.run()
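
Note that parse_url has no timeout or status check: a hung connection stalls the whole run, an error page gets parsed as if it were real content, and 100 back-to-back requests may trip rate limiting. A minimal hardened sketch of that method (the 10-second timeout and 0.5-second pause are arbitrary assumptions, and it has not been verified against the current site):

import time  # add alongside the existing imports

    # inside the GuoKr class, as a drop-in replacement for parse_url
    def parse_url(self, url):
        print(url)
        # Fail fast instead of hanging indefinitely on a slow connection
        response = requests.get(url, headers=self.headers, timeout=10)
        # Raise an HTTPError for 4xx/5xx so error pages are never parsed
        response.raise_for_status()
        # Pause briefly so 100 consecutive requests do not hammer the server
        time.sleep(0.5)
        return response.content.decode()

Either way, each line of guokr.txt ends up as one question URL and its title, separated by a tab.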

