当前位置：首页 > 编程资源 > 编程问答 >内容正文

编程问答

爬取所有校园新闻

发布时间：2024/9/5 编程问答 55 豆豆

生活随笔收集整理的这篇文章主要介绍了爬取所有校园新闻，小编觉得挺不错的，现在分享给大家，帮大家做个参考。

1.获取单条新闻的#标题#链接#时间#来源#内容 #点击次数，并包装成一个函数。

import requests
from bs4 import BeautifulSoup

network = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'


def get_click_count(news_url):
    """Return the click count of the article at *news_url* as an int.

    The counter API is queried with the article's own id instead of a
    fixed one, so the count always belongs to the article being shown.
    """
    # NOTE(review): assumes article URLs end in ".../<id>.html" -- confirm
    # against the live site.
    news_id = news_url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    click_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    reply = requests.get(click_url).text
    # The last '.'-separated piece is expected to look like "html('123');";
    # stripping those wrapper characters leaves only the digits.
    return int(reply.split('.')[-1].lstrip("html('").rstrip("');"))


def get_news_detail(news):
    """Print and return the details of one entry of the news list.

    *news* is an ``<li>`` element of the list page that contains a
    ``.news-list-title`` element.

    Returns a dict with the keys ``url``, ``title``, ``main``, ``time``,
    ``source``, ``content`` and ``click``.  ``content`` is the text of the
    article page's first ``.show-content`` element, or ``''`` if the page
    has none.
    """
    title = news.select('.news-list-title')[0].text
    url = news.select('a')[0]['href']
    info = news.select('.news-list-info')[0]
    published = info.contents[0].text
    main = news.select('.news-list-description')[0].text
    source = info.contents[1].text
    print('链接：{}'.format(url))
    print('标题：{}'.format(title))
    print('正文：{}'.format(main))
    print('时间：{}'.format(published))
    print('来源：{}'.format(source))

    # The full article body is only available on the article's own page.
    res1 = requests.get(url)
    res1.encoding = 'utf-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')
    passage = soup1.select('.show-content')
    content = passage[0].text if passage else ''

    click = get_click_count(url)
    print('点击次数：{}'.format(click))
    return {
        'url': url,
        'title': title,
        'main': main,
        'time': published,
        'source': source,
        'content': content,
        'click': click,
    }


res = requests.get(network)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('li'):
    # <li> elements without a title are not news entries.
    if news.select('.news-list-title'):
        get_news_detail(news)
        break  # this step only handles a single news item

2.获取一个新闻列表页的所有新闻的上述详情，并包装成一个函数。

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

network = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'


def getclick(newsurl):
    """Return the click count of the article at *newsurl* as an int.

    The article id is the file name of the URL without ``.html``; it is
    substituted into the counter API URL so each article gets its own
    count.

    Raises:
        ValueError: if no ``/<id>.html`` part can be found in *newsurl*.
    """
    # Works for any year/section directory, not only ".../2017/...".
    found = re.search(r'/([^/]+)\.html', newsurl)
    if found is None:
        raise ValueError('cannot extract news id from {!r}'.format(newsurl))
    news_id = found.group(1)
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    reply = requests.get(clickurl).text
    # The last '.'-separated piece is expected to look like "html('123');";
    # stripping those wrapper characters leaves only the digits.
    click = int(reply.split('.')[-1].lstrip("html('").rstrip("');"))
    return click


def getnewsdetail(news):
    """Print and return the details of one entry of the news list.

    *news* is an ``<li>`` element of the list page that contains a
    ``.news-list-title`` element.

    Returns a dict with the keys ``url``, ``title``, ``main``, ``time``
    (a ``datetime``), ``source``, ``content`` and ``click``.  ``content``
    is the text of the article page's first ``.show-content`` element, or
    ``''`` if the page has none.
    """
    title = news.select('.news-list-title')[0].text
    url = news.select('a')[0]['href']
    info = news.select('.news-list-info')[0]
    timed = datetime.strptime(info.contents[0].text, '%Y-%m-%d')
    main = news.select('.news-list-description')[0].text
    source = info.contents[1].text
    print('链接：{}'.format(url))
    print('标题：{}'.format(title))
    print('正文：{}'.format(main))
    print('时间：{}'.format(timed))
    print('来源：{}'.format(source))

    # The full article body is only available on the article's own page.
    res1 = requests.get(url)
    res1.encoding = 'utf-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')
    passage = soup1.select('.show-content')
    content = passage[0].text if passage else ''

    click = getclick(url)
    print('点击次数：{}'.format(click))
    return {
        'url': url,
        'title': title,
        'main': main,
        'time': timed,
        'source': source,
        'content': content,
        'click': click,
    }


def getonepage(listurl):
    """Print the details of every news item on the list page *listurl*.

    Returns a list with one dict per news item, in page order (see
    ``getnewsdetail`` for the keys).
    """
    res = requests.get(listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    records = []
    for news in soup.select('li'):
        # <li> elements without a title are not news entries.
        if news.select('.news-list-title'):
            records.append(getnewsdetail(news))
    return records


getonepage(network)

3.获取所有新闻列表页的网址，调用上述函数。

import requests
from bs4 import BeautifulSoup
import re

url_main = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(url_main)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
li = soup.select('li')


def gethits(url_1):
    """Return the click count of the article at *url_1* as a string.

    The article id is the part of the URL between the last ``/`` after an
    underscore and ``.html``; it is sent to the counter API.

    Raises:
        ValueError: if no article id can be found in *url_1*.
    """
    found = re.search(r'_.*/(.*)\.html', url_1)
    if found is None:
        raise ValueError('cannot extract news id from {!r}'.format(url_1))
    li_id = found.group(1)
    reply = requests.get(
        'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(li_id)
    ).text
    # The last '.'-separated piece is expected to look like "html('123');";
    # stripping those wrapper characters leaves only the digits.
    hits = reply.split('.')[-1].rstrip("');").lstrip("'html(")
    return hits


def getpageinfo(label):
    """Collect the details of every news item among the elements *label*.

    *label* is an iterable of ``<li>`` elements from a list page; elements
    without a ``.news-list-title`` are skipped.

    Returns a list with one dict per news item with the keys ``href``,
    ``title``, ``time``, ``info``, ``text`` and ``hits``.  ``text`` is the
    text of the article page's first ``.show-content`` element, or ``''``
    if the page has none.
    """
    records = []
    for title_list in label:
        if not title_list.select('.news-list-title'):
            continue
        href = title_list.select('a')[0]['href']
        title = title_list.select('.news-list-title')[0].text
        spans = title_list.select('span')
        published = spans[0].text
        info = spans[1].text

        res_list = requests.get(href)
        res_list.encoding = 'utf-8'
        soup_list = BeautifulSoup(res_list.text, 'html.parser')
        body = soup_list.select('.show-content')
        text_list = body[0].text if body else ''

        records.append({
            'href': href,
            'title': title,
            'time': published,
            'info': info,
            'text': text_list,
            'hits': gethits(href),
        })
    return records


getpageinfo(li)

# '.a1' holds the total number of news items, e.g. "233条".
# NOTE(review): assumes 10 items per list page -- confirm against the site.
# Round up, so a total that is an exact multiple of 10 does not request a
# page past the end.
total = int(soup.select('.a1')[0].text.rstrip('条'))
pages = (total + 9) // 10
for i in range(2, pages + 1):
    url_page = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    res_page = requests.get(url_page)
    res_page.encoding = 'utf-8'
    soup_page = BeautifulSoup(res_page.text, 'html.parser')
    # Select from the page just fetched, not from the first page's soup.
    list_page = soup_page.select('li')
    getpageinfo(list_page)
    print(url_page)

4.完成所有校园新闻爬取工作

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re


def getclick(newsurl):
    """Return the click count of the article at *newsurl* as an int.

    The article id is the file name of the URL without ``.html``; it is
    substituted into the counter API URL so each article gets its own
    count.

    Raises:
        ValueError: if no ``/<id>.html`` part can be found in *newsurl*.
    """
    # Works for any year/section directory, not only ".../2017/...".
    found = re.search(r'/([^/]+)\.html', newsurl)
    if found is None:
        raise ValueError('cannot extract news id from {!r}'.format(newsurl))
    news_id = found.group(1)
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    reply = requests.get(clickurl).text
    # The last '.'-separated piece is expected to look like "html('123');";
    # stripping those wrapper characters leaves only the digits.
    click = int(reply.split('.')[-1].lstrip("html('").rstrip("');"))
    return click


def getonepage(listurl):
    """Print the details of every news item on the list page *listurl*.

    Returns a list with one dict per news item, in page order, with the
    keys ``url``, ``title``, ``main``, ``time`` (a ``datetime``),
    ``source``, ``content`` and ``click``.  ``content`` is the text of the
    article page's first ``.show-content`` element, or ``''`` if the page
    has none.
    """
    res = requests.get(listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    records = []
    for news in soup.select('li'):
        # <li> elements without a title are not news entries.
        if not news.select('.news-list-title'):
            continue
        title = news.select('.news-list-title')[0].text
        url = news.select('a')[0]['href']
        info = news.select('.news-list-info')[0]
        timed = datetime.strptime(info.contents[0].text, '%Y-%m-%d')
        main = news.select('.news-list-description')[0].text
        source = info.contents[1].text
        print('链接：{}'.format(url))
        print('标题：{}'.format(title))
        print('正文：{}'.format(main))
        print('时间：{}'.format(timed))
        print('来源：{}'.format(source))

        # The full article body is only available on the article's page.
        res1 = requests.get(url)
        res1.encoding = 'utf-8'
        soup1 = BeautifulSoup(res1.text, 'html.parser')
        passage = soup1.select('.show-content')
        content = passage[0].text if passage else ''

        click = getclick(url)
        print('点击次数：{}'.format(click))
        records.append({
            'url': url,
            'title': title,
            'main': main,
            'time': timed,
            'source': source,
            'content': content,
            'click': click,
        })
    return records


getonepage('http://news.gzcc.cn/html/xiaoyuanxinwen/index.html')

res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# '.a1' holds the total number of news items, e.g. "233条".
# NOTE(review): assumes 10 items per list page -- confirm against the site.
# Round up, so a total that is an exact multiple of 10 does not request a
# page past the end.
total = int(soup.select('.a1')[0].text.rstrip('条'))
page = (total + 9) // 10
for i in range(2, page + 1):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getonepage(listurl)

转载于:https://www.cnblogs.com/sisters/p/7655268.html

总结

以上是生活随笔为你收集整理的爬取所有校园新闻的全部内容，希望文章能够帮你解决所遇到的问题。

如果觉得生活随笔网站内容还不错，欢迎将生活随笔推荐给好友。

上一篇： tsconfig.json配置
下一篇： Educational Codeforc