当前位置：首页 > 编程语言 > python >内容正文

python

网页爬虫python代码_Python 爬虫web网页版程序代码

发布时间：2025/4/5 python 35 豆豆

生活随笔收集整理的这篇文章主要介绍了网页爬虫python代码_Python 爬虫web网页版程序代码，小编觉得挺不错的，现在分享给大家，帮大家做个参考。

一：网页结构分析

二：代码实战

#! /usr/bin/env python2

# encoding=utf-8

#BeautifulSoup需要安装 MySQLdb

import sys,os,re,hashlib

import urllib

import httplib2

from lxml import etree

import MySQLdb

from BeautifulSoup import BeautifulSoup

import urllib2

import re

import time

reload(sys)

from datetime import datetime as dt,timedelta

import re

# Shared HTTP client used by download(); constructed with timeout=10.
h = httplib2.Http(timeout=10)

# Request headers that make the crawler present itself as a browser.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
}

# Regular expression meant for matching <a> tags.
# NOTE(review): the literal looks truncated (the markup around the group was
# probably stripped when the listing was published) -- confirm against the
# original script.
pattern = '(.*?)'

# Logging: one file per day under ./sporttery, opened with mode 'w+'.
log_path = './sporttery'
log_file = '%s.log' % dt.now().strftime('%Y-%m-%d')
if not os.path.exists(log_path):
    os.makedirs(log_path)
log = open('%s/%s' % (log_path, log_file), 'w+')

# MySQL connection shared by saveDB(); the session is switched to utf8.
conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='test',
)
conn.set_character_set('utf8')
cur = conn.cursor()
for _charset_stmt in (
    'SET NAMES utf8;',
    'SET CHARACTER SET utf8;',
    'SET character_set_connection=utf8;',
):
    cur.execute(_charset_stmt)
cur.close()

def download(url):
    """Fetch *url* with the shared HTTP client and return the body as text.

    A request that raises is reported on stdout and tried again; once more
    than five attempts have failed the function gives up.

    Args:
        url: Address to request with GET, using the module-level ``headers``.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are ignored),
        or None when every attempt failed.
    """
    fails = 0
    while fails <= 5:
        try:
            _response, content = h.request(url, 'GET', headers=headers)
            return content.decode('utf-8', 'ignore')
        except Exception:
            # Deliberately not a bare ``except:``: KeyboardInterrupt and
            # SystemExit must stop the crawl instead of counting as a
            # failed fetch that gets retried.
            print(u'打开链接失败' + url)
            fails += 1
    return None

def GetMiddleStr(content, startStr, endStr):
    """Return the part of *content* between *startStr* and *endStr*.

    The end marker is looked up only after the end of the start marker, so
    an earlier occurrence of *endStr* no longer produces an empty result
    when a later one exists.

    Args:
        content: Text to cut from.
        startStr: Marker that precedes the wanted text (not included).
        endStr: Marker that follows the wanted text (not included).

    Returns:
        The text between the two markers. An empty string is returned when
        *endStr* occurs in *content* but only before the start marker; this
        is what callers received before and some of them test for a falsy
        result.

    Raises:
        ValueError: If *startStr* or *endStr* does not occur in *content*.
    """
    # str.index raises ValueError on a miss, so no "-1" check is needed.
    startIndex = content.index(startStr) + len(startStr)
    endIndex = content.find(endStr, startIndex)
    if endIndex < 0:
        if endStr not in content:
            raise ValueError('substring not found')
        # End marker exists only ahead of the start marker.
        return content[0:0]
    return content[startIndex:endIndex]

def get_ul(data):
    """Cut a fragment out of *data* by delegating to GetMiddleStr.

    NOTE(review): both marker arguments are empty strings. Judging by the
    function name they were presumably the opening and closing tags of a
    <ul> element that got stripped when the listing was published --
    confirm against the original script before relying on this function.
    """
    return GetMiddleStr(data, '', '')

# Crawl listing page number `i`: download it, pull out the links it contains,
# download every linked detail page, extract the stream address, title and
# introduction from it, and hand them to saveDB().
#
# NOTE(review): this listing lost its indentation and parts of several
# string literals when it was published, so the nesting of the loops and
# conditions below cannot be read from it, and the function does not parse
# as shown. The damaged spots are marked individually.
def test_sporttery(i):

# Address of listing page `i`.
url='http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_'+str(i)+'.html'

print url

# e.g. http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_2.html

source=download(url)

data=get_ul(source)

# NOTE(review): the separator literal is cut off (unterminated string);
# the original delimiter is not recoverable from this listing.
datas=data.split('

for each in datas:

# Collect href attribute values, double- or single-quoted.
# NOTE(review): the backslashes look doubled by the publishing pipeline;
# as written the lookarounds require a literal backslash -- confirm.
ret=re.findall(r"(?<=href=\\").+?(?=\\")|(?<=href=\\').+?(?=\\')" ,each)

for urls in ret:

# Fetch the linked detail page; download() returns None after repeated failures.
detial=download(urls)

if detial:

# Text between 'createFlashVideo' and 'm3u8', with spaces removed.
detial_content=GetMiddleStr(detial,'createFlashVideo','m3u8').replace(' ', '')

if detial_content:

# Re-append ".m3u8", take what lies between 'http://' and '.m3u8', then add
# "m3u8" back; the 'http://' marker itself is not part of the result.
end_url_rex=GetMiddleStr(detial_content+".m3u8",'http://','.m3u8')+"m3u8"

# final URL

# title

sstree = etree.HTML(detial)

ssnodes = sstree.xpath('//*[@id="playVideo"]/div[1]/h2')

for ssn in ssnodes:

# Title text of the matched <h2>, stripped, with any '/h2>' residue removed.
name= ssn.text.strip().replace('/h2>', '')

# NOTE(review): the next two lines are one commented-out statement that was
# split in two; the second half is no longer inside the comment.
#title=GetMiddleStr(detial,'

').replace(' ', '')

# introduction

# NOTE(review): the end marker is an empty string, presumably stripped -- confirm.
introduction=GetMiddleStr(detial,'video-info">','').replace(' ', '')

# NOTE(review): presumably a tag-stripping pattern whose beginning was lost -- confirm.
dr = re.compile(r']+>',re.S)

introductions = dr.sub('',introduction)

end_content=introductions.strip().replace('/span>', '')

# Timestamp string: local time shifted forward by 8 hours.
end_time= time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()+8*60*60))

#end_times=dt.now().strftime('%Y-%m-%d %H:%i:%S')

# Store the record; the page number is passed as a string.
saveDB(urls,end_url_rex,name,end_content,str(i),end_time)

def saveDB(current_url, end_url_rex, names, end_content, page, create_time):
    """Insert one crawled record into ``test.mytables`` and commit it.

    The statement is printed before it is executed. Values are bound as
    query parameters rather than formatted into the SQL text.

    Args:
        current_url: Value for the ``current_url`` column (the caller passes
            the detail page address).
        end_url_rex: Value for the ``end_url_rex`` column (the extracted
            stream address).
        names: Value for the ``names`` column (the title).
        end_content: Value for the ``end_content`` column (the introduction).
        page: Value for the ``page`` column (the caller passes the page
            number as a string).
        create_time: Value for the ``create_time`` column (a formatted
            timestamp string).
    """
    # TODO: look the row up first and update it instead of always inserting.
    # Built from adjacent literals; a backslash continuation inside the
    # quotes is fragile and was broken in the published listing.
    sql = ('INSERT INTO test.mytables'
           '(current_url,end_url_rex,`names`,end_content,page,create_time) '
           'VALUES (%s,%s,%s,%s,%s,%s)')
    print(sql)
    cur = conn.cursor()
    try:
        cur.execute(
            sql,
            (current_url, end_url_rex, names, end_content, page, create_time),
        )
    finally:
        # Release the cursor even when the insert fails.
        cur.close()
    conn.commit()

if __name__ == '__main__':
    # Read the first listing page to learn how many pages exist, then crawl
    # every page from 1 up to that number.
    first = "http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_1.html"
    url = urllib2.urlopen(first)
    try:
        content = url.read()
    finally:
        # The response was never closed before.
        url.close()

    soup = BeautifulSoup(content)
    strs = soup.findAll(attrs={"class": "pagination"})

    # All digit runs inside the first pagination element. The pattern needs a
    # single backslash; r'\\d+' matches a literal backslash followed by "d".
    # NOTE(review): this also picks up digits inside attribute values (for
    # example percent-encoded bytes in hrefs), exactly as the original scan
    # did -- confirm against the real markup and narrow the pattern if needed.
    listss = re.findall(r'\d+', str(strs[0])) if strs else []

    # The last page is the largest number found. Taking the final element of
    # list(set(...)) returned an arbitrary one because sets are unordered.
    # Without any pagination information only page 1 is crawled.
    last_page = max([int(num) for num in listss]) if listss else 1

    for i in range(1, last_page + 1):
        test_sporttery(i)

总结

以上是生活随笔为你收集整理的网页爬虫python代码_Python 爬虫web网页版程序代码的全部内容，希望文章能够帮你解决所遇到的问题。

如果觉得生活随笔网站内容还不错，欢迎将生活随笔推荐给好友。

上一篇： java gc日志乱码_6000+字，3
下一篇： python难度如何_入门Python学