mongona

mongona
-- --
正在获取天气

简单爬取新浪新闻数据

# -*- coding: utf-8 -*-
"""
@author: sato
@file: sina_spider.py
@time: 2019-09-03 15:57

"""
import requests
import re
import multiprocessing
import os


class Spider(object):
    """Download the headline articles linked from the Sina news home page.

    The home page is fetched once, the article links in its headline column
    are extracted with regular expressions, and every linked page is saved
    as ``<title>.html`` inside ``output_dir`` by a pool of worker processes.
    """

    # Directory that downloaded pages are written to.
    output_dir = './html'

    # Raw-string patterns: a bare '\S' in a normal string literal is an
    # invalid escape sequence and triggers a warning on modern Python.
    _SECTION_RE = re.compile(
        r'<div class="part_01 clearfix">([\S\s]*?)'
        r'<div class="part_01 clearfix" data-sudaclick="blk_livevideo">'
    )
    _MIDDLE_RE = re.compile(r'<div class="p_middle">([\S\s]*?)<div class="p_right">')
    _LINK_RE = re.compile(r'<a target="_blank" href="([\S\s]*?)"')
    _TITLE_RE = re.compile(r'<title>([\S\s]*?)</title>')
    # Characters that are path separators or illegal in file names.
    _UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, timeout=10):
        """Set up the request headers and the start URL.

        :param timeout: seconds to wait for each HTTP request before giving
            up, so a stalled connection cannot hang the crawl forever.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36'
        }
        self.timeout = timeout
        # Sina news home page; the daily headlines are located under
        # id="wrap" > class="part_01 clearfix" > class="p_middle"
        self.base_url = 'https://news.sina.com.cn/'

    @classmethod
    def _extract_links(cls, html):
        """Return the article URLs found in the headline column of *html*."""
        section = cls._SECTION_RE.search(html)
        if section is None:
            return []
        middle = cls._MIDDLE_RE.search(section.group(1))
        if middle is None:
            return []
        return cls._LINK_RE.findall(middle.group(1))

    @classmethod
    def _safe_filename(cls, title):
        """Turn a page title into a name that is safe to use as a file name."""
        cleaned = cls._UNSAFE_FILENAME_RE.sub('_', title.strip())
        return cleaned or 'untitled'

    def get_news_list(self):
        """Fetch the home page and return the list of headline article URLs.

        :return: list of URL strings; empty when the expected page sections
            are not present in the response.
        :raises Exception: when the home page does not answer with 200/201.
        """
        # headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string), not the headers.
        res = requests.get(self.base_url, headers=self.headers, timeout=self.timeout)
        if res.status_code not in (200, 201):
            raise Exception('network error!')
        res.encoding = 'utf-8'
        return self._extract_links(res.text)

    def rep_and_write(self, link):
        """Download *link* and save its HTML into ``output_dir``.

        The file is named after the page ``<title>``; when the page has no
        title the sanitized URL is used instead.

        :param link: URL of the article to download.
        :raises Exception: when the page does not answer with 200/201.
        """
        print(f'get data from {link}')
        ret = requests.get(url=link, headers=self.headers, timeout=self.timeout)
        if ret.status_code not in (200, 201):
            raise Exception(f'get {link} error!')
        ret.encoding = 'utf-8'
        content = ret.text
        found = self._TITLE_RE.search(content)
        title = found.group(1) if found else link
        os.makedirs(self.output_dir, exist_ok=True)
        target = os.path.join(self.output_dir, self._safe_filename(title) + '.html')
        # Explicit encoding: the platform default may be unable to encode
        # the Chinese text of the page.
        with open(target, 'w', encoding='utf-8') as f:
            f.write(content)

    def run(self):
        """Crawl the home page and save every headline article in parallel.

        :raises Exception: when no article links could be extracted.
        """
        links = self.get_news_list()
        if not links:
            raise Exception('error!')
        os.makedirs(self.output_dir, exist_ok=True)
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
        pending = [(link, pool.apply_async(self.rep_and_write, (link,))) for link in links]
        pool.close()
        pool.join()
        # apply_async stores worker exceptions in the result object; without
        # get() they would vanish silently. Report them, but keep the crawl
        # best-effort so one bad page does not discard the others.
        for link, result in pending:
            try:
                result.get()
            except Exception as exc:
                print(f'failed to save {link}: {exc}')
        print('done')


# Run the crawl only when executed as a script. Spider.run() starts a
# multiprocessing pool; under the "spawn" start method every worker
# re-imports this module, and without this guard each worker would start
# the crawl again.
if __name__ == '__main__':
    spider = Spider()
    spider.run()

 

0
3
Tags
富强,民主,文明,和谐,自由,平等,公正,法治,爱国,敬业,诚信,友善。
打赏二维码
About
Sato
毕竟,代码只是思想的一种体现而已!!! 架构师就像军师,不是对面啥阵势都用大军队来干,小阵势小技术,小公司不必要也不用引入分布式
Category
Tags
Site statistics

本站现有文章18篇,共被浏览5129

本次响应耗时: 0.054s

当前来路IP: 3.229.122.219  美国

您是本站第: 34573 位访客!

本站已苟活: 

All hots
Article archiving