C++爬虫项目爬取图片
生活随笔
收集整理的这篇文章主要介绍了
C++爬虫项目爬取图片
小编觉得挺不错的,现在分享给大家,帮大家做个参考.
C++爬虫项目爬取图片,
值得注意的是,有些网站设有反爬机制,这类网站上的图片用本程序是爬取不下来的.
主要代码文件
main.cpp文件里面的代码
CHttp.h 文件里面的代码
#include<iostream> #include<windows.h> #include<string> #include<queue> //#include<WinSock2.h>在windows里边 using namespace std;#pragma comment(lib,"ws2_32.lib")//网络的库class CHttp { private:string m_host;string m_object;SOCKET m_socket;bool AnalyseUrl(string url);//解析URL\httpbool AnalyseUrl2(string url);//\httpsbool init();//初始化套接字bool Connect();//连接web服务器 public:CHttp(void);~CHttp(void);string FetchGet(string url);//通过Get方式获取网页void AnalyseHtml(string html);//解析网页,获得图片地址和其他的链接 };CHttp.cpp 实现的文件的代码是
#include "CHttp.h"CHttp::CHttp(void) {}CHttp::~CHttp(void) {closesocket(m_socket);WSACleanup(); }//解析URL\http bool CHttp::AnalyseUrl(string url) {if (string::npos == url.find("http://"))return false;if (url.length() <= 7)return false;int pos = url.find('/', 7);if (pos == string::npos){m_host = url.substr(7);m_object = '/';}else{m_host = url.substr(7, pos - 7);m_object = url.substr(pos);}if (m_host.empty())return false;return true; }//解析URL\https bool CHttp::AnalyseUrl2(string url) {if (string::npos == url.find("https://"))return false;if (url.length() <= 8)return false;int pos = url.find('/', 8);if (pos == string::npos){m_host = url.substr(8);m_object = '/';}else{m_host = url.substr(8, pos - 8);m_object = url.substr(pos);}if (m_host.empty())return false;return true; }bool CHttp::init() {//1 请求协议版本WSADATA wsaData;WSAStartup(MAKEWORD(2, 2), &wsaData);if (LOBYTE(wsaData.wVersion) != 2 ||HIBYTE(wsaData.wVersion) != 2) {printf("请求协议版本失败!\n");return false;}//printf("请求协议成功!\n");//2 创建socketm_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);if (SOCKET_ERROR == m_socket) {printf("创建socket失败!\n");WSACleanup();return false;}//printf("创建socket成功!\n");return true; }//连接web服务器 bool CHttp::Connect() {//DNS服务器:将域名解析成IP地址hostent *p = gethostbyname(m_host.c_str());if (p == NULL)return false;SOCKADDR_IN sa;sa.sin_family = AF_INET;sa.sin_port = htons(80);//http的默认端口,https的默认端口443memcpy(&sa.sin_addr, p->h_addr, 4);if (-1 == connect(m_socket, (SOCKADDR*)&sa, sizeof(sa))){cout << "服务器连接失败" << endl;return false;}else{//cout<<"服务器连接成功"<<endl;return true;} }string CHttp::FetchGet(string url)//通过Get方式获取网页 {string html;//解析urlif (false == AnalyseUrl(url)){if (false == AnalyseUrl2(url)){cout << "Html解析失败" << endl;return "";}}//cout<<"主机名"<<m_host<<"\t\t"<<"资源名"<<m_object<<endl;if (false == init())//初始化套接字{return "";}if (false == Connect())//连接服务器{return "";}//发送Get请求 Get请求数据string request = "GET " + m_object +" HTTP/1.1\r\nHost:" + m_host +"\r\nConnection: Close\r\n\r\n";if (SOCKET_ERROR == 
send(m_socket, request.c_str(), request.size(), 0)){cout << "send request error" << endl;closesocket(m_socket);return "";}//接收数据char ch;while (recv(m_socket, &ch, 1, 0)){html += ch;}return html; } //判断是否以什么结尾 bool hasEnding(char *& strFull, char*& strEnd) {char * pFull = strFull;while (*pFull != 0)pFull++;char * pEnd = strEnd;while (*pEnd != 0)pEnd++;while (1){pFull--;pEnd--;if (*pEnd == 0){break;}if (*pFull != *pEnd){return false;}}return true; } void CHttp::AnalyseHtml(string html)//解析网页,获得图片地址和其他的链接 {int startIndex = 0;int endIndex = 0;//找到所有的图片for (int pos = 0;pos < html.length();){startIndex = html.find("src=\"", startIndex);if (startIndex == -1){break;}startIndex += 5;endIndex = html.find("\"", startIndex);//找到资源链接string src = html.substr(startIndex, endIndex - startIndex);char *src1 = (char *)src.c_str();//cout<<src<<endl;//判断连接是否是想要的资源 // char *strend = ".jpg";// char* strend = new char[20];// strcpy(strend, ".jpg");char* strend = new char[20]{ ".jpg" };if (hasEnding(src1, strend) == true){/*if(-1!=src.find("t_s960x600c5"))*/if (-1 != src.find("t_s1920x1080c5")){cout << src << endl;//新建一个线程来下载图片extern queue<string> p;p.push(src);extern void loadImage();CreateThread(NULL, NULL, (LPTHREAD_START_ROUTINE)loadImage,NULL, NULL, NULL);}/*system("pause");*/}startIndex = endIndex + 1;//system("pause");}startIndex = 0;//找到其他URL地址for (int pos = 0;pos < html.length();){startIndex = html.find("href=\"", startIndex);if (startIndex == -1){break;}startIndex += 6;endIndex = html.find("\"", startIndex);//找到资源链接string src = html.substr(startIndex, endIndex - startIndex);char *src1 = (char *)src.c_str();//cout<<src<<endl;//判断连接是否是想要的资源//char *strend = ".html";//char* strend = new char[20];//strcpy(strend, ".html");char* strend = new char[100]{ ".html" };if (hasEnding(src1, strend) == true){if ((-1 != src.find("bizhi") || -1 != src.find("showpic")) && -1 == src.find("http://")){string url = "http://desk.zol.com.cn" + src;extern queue<string> 
q;q.push(url);//cout<<url<<endl;}}startIndex = endIndex + 1;//system("pause");}}总结
以上是生活随笔为你收集整理的C++爬虫项目爬取图片的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: MFC之实现鼠标自动左击,频率可调,支持
- 下一篇: C++实现黑客帝国流星雨效果