python抓取头条文章
生活随笔
收集整理的这篇文章主要介绍了
python抓取头条文章
小编觉得挺不错的，现在分享给大家，帮大家做个参考。
Python抓取头条美文并存储到MongoDB
# Author:song
"""Crawl Toutiao search results for the keyword '美文' and store them in MongoDB.

For each result page the script fetches the JSON index, follows every
``article_url`` it contains, extracts the page title and article text, and
inserts one document per article into the ``toutiaowenzhang`` collection.
"""
import json
import re
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests import RequestException

# Seconds to wait on a single HTTP request; without a timeout a stalled
# connection would block its worker process forever.
_REQUEST_TIMEOUT = 10

# Compiled once at import time instead of on every parsed page.
_ARTICLE_PATTERN = re.compile('content.*?<div>(.*?)</div>', re.S)
# Raw string: the original non-raw '\.' is an invalid string escape.
_NOISE_PATTERN = re.compile(r'[a-zA-Z0-9/#;&\._]')

# connect=False defers connecting until the first operation -- presumably so
# that each Pool worker opens its own connection after it has been started.
client = pymongo.MongoClient('localhost', connect=False)
db = client['toutiaowenzhang']


def _fetch_text(url):
    """Return the response body of *url*, or None on failure.

    A failure is either a ``RequestException`` raised by ``requests.get``
    (including a timeout) or any status code other than 200.
    """
    # The request itself must sit inside the try block: that is the call
    # that raises RequestException, not the status-code check.
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text


def get_index(offset):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The response text, or None if the request failed.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '美文',
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    return _fetch_text(url)


def get_urls(html):
    """Yield the ``article_url`` of every entry in a search-result payload.

    Args:
        html: JSON text as returned by ``get_index``; may be None.

    Yields:
        The ``article_url`` value of each entry. This can be None when an
        entry has no such key, so callers should skip falsy values.
    """
    if not html:
        # The index request failed; there is nothing to parse.
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # The body was not valid JSON (JSONDecodeError subclasses ValueError).
        return
    if not isinstance(payload, dict):
        return
    # "or []" also covers a payload whose "data" key is present but null.
    for entry in payload.get('data') or []:
        if isinstance(entry, dict):
            yield entry.get('article_url')


def get_index_detail(url):
    """Fetch an article page.

    Args:
        url: Address of the article page.

    Returns:
        The page HTML as text, or None if the request failed.
    """
    return _fetch_text(url)


def parse_detail(html):
    """Extract the title and article text from an article page.

    Args:
        html: Page HTML as returned by ``get_index_detail``; may be None.

    Returns:
        A dict with the keys ``'title'`` and ``'article'``, or None when the
        page is missing (for example a 404) or has no ``<title>`` element.
    """
    if not html:
        # Explicit guard replacing the original "except TypeError: pass",
        # which relied on BeautifulSoup failing on a None document.
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    if not title_tags:
        return None
    title = title_tags[0].get_text()

    fragments = _ARTICLE_PATTERN.findall(html)
    # Join the matched fragments rather than calling str() on the list, which
    # leaked the list's brackets, quotes and escape backslashes into the text.
    # Then strip every letter, digit and markup character in one pass.
    article = _NOISE_PATTERN.sub('', '\n'.join(fragments)).strip()
    return {'title': title, 'article': article}


def save_to_mongodb(result):
    """Insert one parsed article into the ``toutiaowenzhang`` collection.

    Prints ``successful`` when the document was stored and ``fail`` otherwise.

    Args:
        result: Dict produced by ``parse_detail``; a falsy value is reported
            as a failure instead of being sent to the database.
    """
    if not result:
        print('fail')
        return
    # Collection.insert() is deprecated and removed in PyMongo 4.
    outcome = db['toutiaowenzhang'].insert_one(result)
    if outcome.inserted_id is not None:
        print('successful')
    else:
        print('fail')


def main(offset):
    """Crawl one page of search results and store every article found.

    Args:
        offset: Search-result offset handled by this worker.
    """
    index_html = get_index(offset)
    for article_url in get_urls(index_html):
        if not article_url:
            continue
        parsed = parse_detail(get_index_detail(article_url))
        if parsed is None:
            # Page could not be fetched or parsed; skip it.
            continue
        save_to_mongodb(parsed)


if __name__ == '__main__':
    groups = [x * 20 for x in range(3)]
    # The context manager releases the worker processes once map() returns.
    with Pool() as pool:
        pool.map(main, groups)
转载于:https://www.cnblogs.com/master-song/p/8922850.html
总结
以上是生活随笔为你收集整理的python抓取头条文章的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: MySQL数据类型(最大值 和 最小值)
- 下一篇: python中的‘/’和'//'