欢迎访问 生活随笔!

生活随笔

当前位置: 首页 > 编程资源 > 编程问答 >内容正文

编程问答

Part2-Chapter8-预测乐高玩具套装价格

发布时间:2024/3/26 编程问答 47 豆豆
生活随笔收集整理的这篇文章主要介绍了 Part2-Chapter8-预测乐高玩具套装价格,小编觉得挺不错的,现在分享给大家,帮大家做个参考。
"""Predict LEGO set prices from saved eBay listing pages.

The goal is to scrape second-hand LEGO listings from locally saved eBay
result pages and to derive a pricing equation with ridge regression whose
regularization strength is chosen by cross-validation.
"""
import random

import numpy as np

# Number of ridge penalties evaluated; penalty i is exp(i - 10).
NUM_TEST_PTS = 30


def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved listing page and append sold items to retX / retY.

    Args:
        retX: list receiving one ``[yr, numPce, newFlag, origPrc]`` row per
            accepted listing (mutated in place).
        retY: list receiving the matching selling prices (mutated in place).
        inFile: path of the saved HTML page (read as UTF-8).
        yr: release year of the set.
        numPce: number of pieces in the set.
        origPrc: original retail price of the set.
    """
    # Imported here so the regression helpers stay usable without bs4.
    from bs4 import BeautifulSoup

    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # kept as-is because switching parsers may change how the page is parsed.
    soup = BeautifulSoup(html)

    # Listings are <table> elements carrying the attribute r="1", r="2", ...
    i = 1
    currentRow = soup.find_all('table', r="%d" % i)
    while currentRow:
        row = currentRow[0]
        lwrTitle = row.find_all('a')[1].text.lower()
        # A listing counts as new when its title says so.
        newFlag = 1.0 if ('new' in lwrTitle or 'nisb' in lwrTitle) else 0.0

        cells = row.find_all('td')
        # Only sold listings are collected; they carry a <span> in cell 3.
        if len(cells[3].find_all('span')) == 0:
            print("商品 #%d 没有出售" % i)
        else:
            priceStr = cells[4].text
            for junk in ('$', ',', 'Free shipping'):
                priceStr = priceStr.replace(junk, '')
            sellingPrice = float(priceStr)
            # Prices below half the retail price are taken to be
            # incomplete sets and are skipped.
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.find_all('table', r="%d" % i)


def setDataCollect(retX, retY):
    """Scrape every saved set page, filling retX and retY in place."""
    scrapePage(retX, retY, 'lego8288.html', 2006, 800, 49.99)
    scrapePage(retX, retY, 'lego10030.html', 2002, 3096, 269.99)
    scrapePage(retX, retY, 'lego10179.html', 2007, 5195, 499.99)
    scrapePage(retX, retY, 'lego10181.html', 2007, 3428, 199.99)
    scrapePage(retX, retY, 'lego10189.html', 2008, 5922, 299.99)
    scrapePage(retX, retY, 'lego10196.html', 2009, 3263, 249.99)


def _columnVariance(xMat):
    """Return per-column variances as an ndarray, with zeros replaced by 1.

    A constant column has zero variance; dividing by 1 instead leaves its
    centered values at 0 rather than producing NaN.
    """
    colVar = np.array(np.var(xMat, 0), dtype=float)
    colVar[colVar == 0.0] = 1.0
    return colVar


def regularize(xMat, yMat):
    """Standardize the features and center the targets.

    Each feature column is centered and divided by its variance (not its
    standard deviation), matching the scaling used by ``ridgeTest`` and
    ``crossValidation``.

    Returns:
        ``(inxMat, inyMat)``: the scaled features and the centered targets.
        The inputs are not modified.
    """
    inyMat = yMat - np.mean(yMat, 0)
    inxMat = (xMat - np.mean(xMat, 0)) / _columnVariance(xMat)
    return inxMat, inyMat


# Compute the squared error
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between yArr and yHatArr."""
    return ((yArr - yHatArr) ** 2).sum()


# Compute the regression coefficients w
def standRegres(xArr, yArr):
    """Solve ordinary least squares through the normal equations.

    Returns:
        The weights as an (n, 1) matrix, or ``None`` (after printing a
        message) when ``X^T X`` has a zero determinant.
    """
    # np.asmatrix instead of np.mat: the np.mat alias is gone in NumPy 2.0.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        print("无法求逆")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws


# Ridge regression for a single penalty
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve ``(X^T X + lam * I) w = X^T y`` for the ridge weights.

    Args:
        xMat: (m, n) feature matrix.
        yMat: (m, 1) target column.
        lam: ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The weights as an (n, 1) matrix.

    Raises:
        numpy.linalg.LinAlgError: if the penalized system is singular.
    """
    xMat = np.asmatrix(xMat)
    yMat = np.asmatrix(yMat)
    denom = xMat.T * xMat + np.eye(np.shape(xMat)[1]) * lam
    return np.asmatrix(np.linalg.solve(denom, xMat.T * yMat))


# Ridge regression with cross-validation
def crossValidation(xArr, yArr, numVal=10):
    """Pick the ridge penalty by repeated random 90/10 splits and print the model.

    For each of ``numVal`` shuffles, the ridge path is fitted on the
    training part and scored by RSS on the held-out part. The penalty with
    the lowest mean error is then refitted on all data, the weights are
    mapped back to the original feature scale, and the resulting equation
    is printed.

    Args:
        xArr: list of feature rows; the printed equation names four
            features (year, piece count, new flag, original price).
        yArr: list of target prices.
        numVal: number of random splits.

    Raises:
        ValueError: if fewer than two samples are given or numVal < 1.
    """
    m = len(yArr)
    if m < 2:
        raise ValueError("crossValidation needs at least two samples")
    if numVal < 1:
        raise ValueError("numVal must be at least 1")
    # Same count as the test `j < m * 0.9`, but always leaves one test row.
    nTrain = min(int(np.ceil(m * 0.9)), m - 1)

    indexList = list(range(m))
    errorMat = np.zeros((numVal, NUM_TEST_PTS))
    for i in range(numVal):
        random.shuffle(indexList)
        trainIdx, testIdx = indexList[:nTrain], indexList[nTrain:]
        trainX = [xArr[j] for j in trainIdx]
        trainY = [yArr[j] for j in trainIdx]
        testX = [xArr[j] for j in testIdx]
        testY = np.array([yArr[j] for j in testIdx])

        wMat = ridgeTest(trainX, trainY)

        # Scale the test rows with the training statistics; this does not
        # depend on the penalty, so it is done once per split.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / _columnVariance(matTrainX)
        yMeanTrain = np.mean(trainY)
        for k in range(NUM_TEST_PTS):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + yMeanTrain
            errorMat[i, k] = rssError(yEst.T.A, testY)

    # Penalty with the smallest mean held-out error (first one on ties).
    meanErrors = np.mean(errorMat, 0)
    bestIdx = int(np.argmin(meanErrors))

    # Refit on all data so the weights match the full-data mean/variance
    # used below to undo the standardization.
    bestWeights = ridgeTest(xArr, yArr)[bestIdx:bestIdx + 1, :]

    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)
    meanX = np.mean(xMat, 0)
    varX = _columnVariance(xMat)
    # Undo the standardization: x_std = (x - mean) / var.
    unReg = bestWeights / varX
    intercept = -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat)
    print('%f%+f*年份%+f*部件数量%+f*是否全新%+f*原价' % (intercept, unReg[0, 0], unReg[0, 1], unReg[0, 2], unReg[0, 3]))


# Ridge regression over a range of penalties
def ridgeTest(xArr, yArr):
    """Fit ridge regression for NUM_TEST_PTS penalties ``exp(i - 10)``.

    Features are centered and divided by their variance, and targets are
    centered, before fitting.

    Returns:
        ndarray of shape (NUM_TEST_PTS, n_features); row i holds the
        weights for penalty ``exp(i - 10)``.
    """
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    yMat = yMat - np.mean(yMat, axis=0)
    xMat = (xMat - np.mean(xMat, axis=0)) / _columnVariance(xMat)
    wMat = np.zeros((NUM_TEST_PTS, np.shape(xMat)[1]))
    for i in range(NUM_TEST_PTS):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = np.asarray(ws).ravel()
    return wMat


if __name__ == "__main__":
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    crossValidation(lgX, lgY)

总结

以上是生活随笔为你收集整理的Part2-Chapter8-预测乐高玩具套装价格的全部内容,希望文章能够帮你解决所遇到的问题。

如果觉得生活随笔网站内容还不错,欢迎将生活随笔推荐给好友。