6、通过xpath获取网页数据
生活随笔
收集整理的这篇文章主要介绍了
6、通过xpath获取网页数据
小编觉得挺不错的，现在分享给大家，帮大家做个参考。
1、xpath解析网页源文件
from urllib import request

from lxml import etree

# URL of the page to request.
url = "http://www.dfenqi.cn/Product/Index"
# Request headers; a browser-like User-Agent is sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}
# Build the request object.
req = request.Request(url, headers=headers)
# Create the HTTP handler.
httpHandler = request.HTTPHandler()
# Build the opener from the handler.
opener = request.build_opener(httpHandler)
# Send the request and read the page source. The ``with`` block closes the
# response (and its underlying connection) once the body has been read.
with opener.open(req) as response:
    html = response.read().decode('utf-8')
# XPath expression selecting the text of each product title.
xpath = "//div[@class='liebiao']/ul/li/p/text()"
# Alternative: select the list of ``class`` attribute values instead.
# xpath = "//div[@class='liebiao']/ul/li/p/@class"
# Turn the HTML text into a parseable element tree.
selector = etree.HTML(html)
# Run the XPath query; the result is a list of matches.
goodsList = selector.xpath(xpath)
# Print every product title.
for goods in goodsList:
    print(goods)

# 2、xpath解析源文件,并下载图片至本地
# (Part 2: parse the page source with XPath and download the images locally)
import os
from urllib import parse
from urllib import request

from lxml import etree


class Spilder():
    """Small crawler that fetches a page, extracts image URLs and saves the images."""

    def __init__(self, pageUrl):
        """
        Store the target URL and build the opener used for every request.

        :param pageUrl: URL of the page to crawl
        """
        # URL of the page to crawl.
        self.pageUrl = pageUrl
        # Request headers sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        # Handler used to build the opener.
        self.httpHandler = request.HTTPHandler()
        # Opener used to send the requests.
        self.opener = request.build_opener(self.httpHandler)

    def loadPage(self):
        """
        Request the page at ``self.pageUrl``.

        :return: the raw (undecoded) page source as bytes
        """
        req = request.Request(self.pageUrl, headers=self.headers)
        # The ``with`` block closes the response once the body has been read.
        with self.opener.open(req) as response:
            return response.read()

    def getImageUrls(self, html, xpath):
        """
        Parse the page source and evaluate an XPath expression against it.

        :param html: page source
        :param xpath: XPath expression to evaluate
        :return: list of matches returned by the XPath query
        """
        selector = etree.HTML(html)
        return selector.xpath(xpath)

    def loadImage(self, url):
        """
        Download one image.

        :param url: image URL; a relative URL is resolved against ``self.pageUrl``
        :return: the image data as bytes
        """
        # ``src`` attributes are often relative; urljoin leaves absolute URLs
        # untouched and resolves relative ones against the crawled page.
        absoluteUrl = parse.urljoin(self.pageUrl, url)
        req = request.Request(absoluteUrl, headers=self.headers)
        with self.opener.open(req) as response:
            return response.read()

    def writeImage(self, img, imgName):
        """
        Create an ``image`` sub-folder under the current directory (if it does
        not exist yet) and write the image into it.

        :param img: image data (bytes)
        :param imgName: file name to store the image under
        :return: None
        """
        folderName = os.path.join(os.path.abspath(os.curdir), "image")
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(folderName, exist_ok=True)
        # Write into the very folder that was just ensured to exist.
        with open(os.path.join(folderName, imgName), 'wb') as f:
            f.write(img)


if __name__ == "__main__":
    url = "http://www.dfenqi.cn/Product/Index"
    spilder = Spilder(url)
    html = spilder.loadPage()
    xpath = "//div[@class='liebiao']/ul/li/div/a/img/@src"
    imgUrls = spilder.getImageUrls(html, xpath)
    # Images are numbered from 1: img1.jpg, img2.jpg, ...
    for index, imgUrl in enumerate(imgUrls, start=1):
        img = spilder.loadImage(imgUrl)
        spilder.writeImage(img, 'img%s.jpg' % index)

# Reposted from (转载于): https://www.cnblogs.com/toloy/p/8618007.html
总结
以上是生活随笔为你收集整理的《6、通过xpath获取网页数据》的全部内容，希望文章能够帮你解决所遇到的问题。
- 上一篇: 格式工厂软件处理视频
- 下一篇: Swoole练习 Web