浏览 1575 次
|
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
|
|
|---|---|
| 作者 | 正文 |
|
时间:2007-11-07 关键字: C++/CLI, IOCP
http://microsoft.csdn.net/vs/dev%5Fcontest/Ms_rich_Detail.aspx?pointid=102
开发环境:Visual Studio 2005 语言:C++/CLI 作品所要解决的主要问题: 对于给定的网站列表,下载其所有网页到数据库,可以限制抓取深度和吞吐量。 存储的信息包括标题、内容、网页大小、抓取时间、链接数、连接数等等。 作品的主要应用场景: 可用于数据采集、数据挖掘以及搜索引擎的前期工作。 基于IOCP模型构建,稳定性有保证。 而且有详细的开发文档。 使用说明: 1.直接点击bin目录下的WebSpiderEh.exe,即可开始抓取网站。 2.bin\db.mdb中的sites表配置您要抓取的网站,pages表保存抓取的结果。 3.maxDepth.txt中的数字控制抓取的深度。 4.throughput.txt中的数字控制蜘蛛的吞吐量,一般不用修改,如果您的网速很快,可以将数字调大一点。 谢谢使用,欢迎提出宝贵意见! 如果您下载后觉得好用,或者觉得源码对您有借鉴的价值,请投出您宝贵的一票。 声明:JavaEye文章版权属于作者,受法律保护。没有作者书面许可不得转载。
|
|
| 返回顶楼 | |
|
时间:2007-11-07
// WebSpiderEh.cpp: 主项目文件。
#include "stdafx.h"
#include "InitSock.h"
#include "DataUnit.h"
#include "HTMLDoc.h"
#include "UrlQueue.h"
#include "stdio.h"
#include "algorithm"
#include "map"
#include "string"
#include "iostream"
// Tunable constants for the crawler.
#define BUFSMALL 1024 //size of the small buffer (one receive)
#define BUFBIG 600000 //size of the large buffer (a whole web page)
#define PRONUMBER 2 //number of concurrent processes
#define TIME_OUT_TIME 1 //connect timeout: 1 second
#define TIMEOUT_IDLE 500 //anti-hang idle limit, in seconds
#define TIMEOUT_IOCP INFINITE //IOCP polling period
#define UPDATE_CYCLE 3 //update cycle of the hot keywords (in days)
//#define DBG
using namespace System;
using namespace System::Net;
using namespace System::Text;
using namespace System::IO;
using namespace System::Text::RegularExpressions;
using namespace System::Runtime::InteropServices;
using namespace System::Collections::Generic;
using namespace System::Data;
using namespace System::Data::OleDb;
using namespace System::Threading;
using namespace System::Diagnostics;
using namespace std;
// Converts a managed String^ into a native ANSI char* buffer.
// The buffer is allocated by Marshal::StringToHGlobalAnsi and is not released here.
char * strtochar(String ^s){
    IntPtr nativeText=Marshal::StringToHGlobalAnsi(s);
    return static_cast<char *>(nativeText.ToPointer());
}
// Converts a managed String^ into a standard std::string (ANSI code page).
// Returns an empty string for a null handle.
string Str2str(String^ s){
    if(s==nullptr)
        return string();
    IntPtr nativeText=Marshal::StringToHGlobalAnsi(s);
    // StringToHGlobalAnsi already NUL-terminates the buffer. The terminator must not be
    // re-written at s->Length: with multi-byte ANSI characters the byte length exceeds the
    // character count, so doing that would cut the text short.
    string converted(static_cast<const char *>(nativeText.ToPointer()));
    // The std::string owns its own copy, so the marshalled buffer can be released.
    Marshal::FreeHGlobal(nativeText);
    return converted;
}
// Returns the hash of `text` produced by FormsAuthentication with the "md5" format.
String^ strtomd5(String^ text){
    String^ digest=System::Web::Security::FormsAuthentication::HashPasswordForStoringInConfigFile(text,"md5");
    return digest;
}
// Holder for the crawler's process-wide managed state (all members are static).
ref class Global{
public:
//g_visited is a global that records the pages already visited, to avoid visiting them again
static Dictionary<String^,int> g_visited;
//g_titleused is a global that records the titles already seen, used to de-duplicate titles
static Dictionary<String^,int> g_titleused;
//static IndexWriter^ g_writer;
//g_priorities stores the priority of each web site
static Dictionary<String^,int> g_priorities;
// Shared database connection, opened lazily by addPage
static OleDbConnection^ g_conn;
// Number of requests counted as currently outstanding
static int postAmount=0;
// Limit compared against postAmount; parsed from the first line of throughput.txt
static const int throughput=int::Parse(File::OpenText("throughput.txt")->ReadLine());
// Maximum number of URI segments a link may have to be queued; parsed from the first line of maxDepth.txt
static const int max_depth=int::Parse(File::OpenText("maxDepth.txt")->ReadLine());
};
// Initializes the Winsock library
CInitSock theSock;
HANDLE hCompletion;//completion port
//controls whether the I/O polling thread keeps working
bool g_iogo;
// Context attached to each socket that is associated with the completion port.
typedef struct _PER_HANDLE_DATA // per-handle data
{
SOCKET s; // the corresponding socket handle
sockaddr_in addr; // server address
} PER_HANDLE_DATA, *PPER_HANDLE_DATA;
// Context for one download: the OVERLAPPED header comes first so that a pointer to the
// structure can be passed where an OVERLAPPED pointer is expected.
typedef struct _PER_IO_DATA // per-I/O data
{
OVERLAPPED ol; // overlapped structure
char buf[BUFBIG]; // data buffer
int nOperationType; // operation type
char url[BUFSMALL]; //URL of the page
int top; //write position inside buf
#define OP_READ 1
#define OP_WRITE 2
} PER_IO_DATA, *PPER_IO_DATA;
// Returns the host of `url` with everything up to and including its first dot removed.
// A host without any dot is returned unchanged (IndexOf yields -1, so Substring starts at 0).
String^ getDomain(String^ url){
    String^ hostName=(gcnew Uri(url))->Host;
    int firstDot=hostName->IndexOf(".");
    return hostName->Substring(firstDot+1);
}
// Starts the download of `url`: connects, sends an HTTP/1.0 GET, associates the socket with
// the completion port `hCompletion` and posts the first overlapped receive.
//
// URLs that were already visited, are longer than 255 characters, or end with a known
// binary extension are ignored. On success the lower-cased URL is recorded in
// Global::g_visited and Global::postAmount is incremented; from then on the socket and both
// context blocks belong to the thread that services the completion port.
// On any failure everything allocated here is released and the function returns silently
// (the failure is printed only when DBG is defined, apart from the two console messages below).
void PostGet(String^ url){
    SOCKET cs=INVALID_SOCKET;
    PPER_HANDLE_DATA pPerHandle=NULL;
    PPER_IO_DATA pPerIO=NULL;
    char* ipAnsi=NULL;
    char* pathAnsi=NULL;
    char* hostAnsi=NULL;
    char* urlAnsi=NULL;
    bool posted=false; // true once the receive is queued and ownership has moved to the I/O thread
    try{
        String^ urlflag=url->ToLower();
        if(Global::g_visited.ContainsKey(urlflag))return ;
        if(url->Length>255)return ;
        // Skip obvious binary downloads; compared on the lower-cased URL so ".ZIP" is skipped too.
        array<String^>^ skippedExtensions=gcnew array<String^>{".zip",".gz",".rar",".exe",".jpg",".png",
            ".tar",".chm",".iso",".gif",".csv",".pdf",".doc"};
        for each(String^ extension in skippedExtensions){
            if(urlflag->EndsWith(extension))return ;
        }
        Uri^ uri=gcnew Uri(url);
        String^ host=uri->Host; // host name
        String^ destWeb=uri->PathAndQuery; // absolute path and query
        // inet_addr only understands dotted IPv4 text, so pick the first IPv4 address.
        IPAddress^ ipv4=nullptr;
        for each(IPAddress^ candidate in Dns::GetHostEntry(host)->AddressList){
            if(candidate->AddressFamily==System::Net::Sockets::AddressFamily::InterNetwork){
                ipv4=candidate;
                break;
            }
        }
        if(ipv4==nullptr)return ;
        ipAnsi=strtochar(ipv4->ToString());
        pathAnsi=strtochar(destWeb);
        hostAnsi=strtochar(host);
        urlAnsi=strtochar(url);

        SOCKADDR_IN servAddr;
        memset(&servAddr,0,sizeof(servAddr));
        servAddr.sin_family=AF_INET;
        servAddr.sin_addr.S_un.S_addr=inet_addr(ipAnsi);
        servAddr.sin_port=htons((u_short)uri->Port);
        cs=::socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);
        if(INVALID_SOCKET==cs){
            Console::WriteLine(L"创建套接字失败");
            return ;
        }
        if(::connect(cs,(SOCKADDR*)&servAddr,sizeof(SOCKADDR))==SOCKET_ERROR){
            Console::WriteLine(WSAGetLastError());
            return ; // the finally block closes the socket
        }

        // Build the HTTP request header.
        static const char requestFormat[]="GET %s HTTP/1.0\r\nHost: %s\r\nAccept: */*\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)\r\nReferer: %s\r\nPragma: no-cache\r\nCache-Control: no-cache\r\nConnection: close\r\n\r\n";
        // sprintf_s aborts the process when the output does not fit, so check the size first
        // (sizeof(requestFormat) over-counts by the placeholders, which keeps the test conservative).
        size_t urlBytes=strlen(urlAnsi);
        if(strlen(pathAnsi)+strlen(hostAnsi)+urlBytes+sizeof(requestFormat)>=BUFSMALL)return ;
        char getstr[BUFSMALL]={0};
        sprintf_s(getstr,requestFormat,pathAnsi,hostAnsi,urlAnsi);
        int sendlen=(int)strlen(getstr);
        if(::send(cs,getstr,sendlen,0)==SOCKET_ERROR)return ;

        // Create the per-handle data for the new connection and associate it with the completion port.
        pPerHandle=(PPER_HANDLE_DATA)::GlobalAlloc(GPTR, sizeof(PER_HANDLE_DATA));
        if(pPerHandle==NULL)return ;
        pPerHandle->s = cs;
        memcpy(&pPerHandle->addr, &servAddr, sizeof(servAddr));
        // The completion key is pointer sized; a DWORD cast would truncate it on 64-bit builds.
        if(::CreateIoCompletionPort((HANDLE)pPerHandle->s, hCompletion, (ULONG_PTR)pPerHandle, 0)==NULL)return ;

        // Post the first receive request.
        pPerIO=(PPER_IO_DATA)::GlobalAlloc(GPTR, sizeof(PER_IO_DATA));
        if(pPerIO==NULL)return ;
        pPerIO->buf[0]='\0';
        pPerIO->nOperationType=OP_READ;
        pPerIO->top=0;
        // Copy by byte count (not character count); GPTR zero-fills, so the copy stays NUL-terminated.
        memcpy(pPerIO->url,urlAnsi,urlBytes);
        WSABUF buf;
        buf.buf=pPerIO->buf;
        buf.len=BUFSMALL;
        DWORD nFlags=0;
        DWORD dwTrans=0;
        if(::WSARecv(pPerHandle->s, &buf, 1, &dwTrans, &nFlags, &pPerIO->ol, NULL)==SOCKET_ERROR
            && ::WSAGetLastError()!=WSA_IO_PENDING){
            return ; // nothing was queued, so no completion will ever release these blocks
        }
        posted=true;
        Global::g_visited[urlflag]=1; // mark as visited
        Global::postAmount++;
    }catch(Exception^ ex){
#ifdef DBG
        Console::WriteLine(ex->Message);
#endif
    }finally{
        // The marshalled ANSI copies are only needed while the request is being built.
        if(ipAnsi!=NULL)Marshal::FreeHGlobal(IntPtr(static_cast<void*>(ipAnsi)));
        if(pathAnsi!=NULL)Marshal::FreeHGlobal(IntPtr(static_cast<void*>(pathAnsi)));
        if(hostAnsi!=NULL)Marshal::FreeHGlobal(IntPtr(static_cast<void*>(hostAnsi)));
        if(urlAnsi!=NULL)Marshal::FreeHGlobal(IntPtr(static_cast<void*>(urlAnsi)));
        if(!posted){
            if(pPerIO!=NULL)::GlobalFree(pPerIO);
            if(pPerHandle!=NULL)::GlobalFree(pPerHandle);
            if(cs!=INVALID_SOCKET)::closesocket(cs);
        }
    }
}
// addPage stores one parsed web page in the Access database (table Pages).
//
// URL, Title, PlainText, LinkText, Size, Host and OutDegree are written as given, except that
// a title which was already used is replaced by the first unused 10-30 character phrase found
// in PlainText (Priority is decremented when no replacement could be found).
// Priority is raised when the URL is the root page of its host or of "www.<domain>".
//
// Returns 0 when the row was inserted, 1 when the URL is already stored or any error occurred.
// Opens (and caches in Global::g_conn) the database connection on first use; a failure to
// open the connection is not caught here.
int addPage(String^ URL,String^ Title,String^ PlainText,String^ LinkText,int Size,String^ Host,int OutDegree,int Priority){
    OleDbConnection^ conn=Global::g_conn;
    if(conn==nullptr || conn->State!=System::Data::ConnectionState::Open){
        String^ connstr="Provider=Microsoft.Jet.OLEDB.4.0;Data Source="+System::AppDomain::CurrentDomain->BaseDirectory+"\\db.mdb";
        conn=gcnew OleDbConnection(connstr);
        conn->Open();
        Global::g_conn=conn;
    }
    try{
        // Title de-duplication.
        if(!Global::g_titleused.ContainsKey(Title)){
            Global::g_titleused[Title]=1;
        }
        else{
            Regex^ reg=gcnew Regex("\\W{1}(?<tt>[0-9a-zA-Z_:,、;《》“”\u4e00-\u9fa5]{10,30})\\W");
            Match^ mc=reg->Match(PlainText);
            if(mc->Success){
                String^ tmp=mc->Groups["tt"]->ToString();
                while(Global::g_titleused.ContainsKey(tmp)){
                    mc=mc->NextMatch();
                    if(!mc->Success){
                        tmp=Title; // no unused phrase left: keep the duplicate title
                        break;
                    }
                    tmp=mc->Groups["tt"]->ToString();
                }
                if(Title->Equals(tmp))Priority--;
                Title=tmp;
                Global::g_titleused[Title]=1;
            }
        }

        // Priority boosts for site root pages.
        String^ urlLower=URL->ToLower();
        String^ hostRoot=("http://"+Host)->ToLower();
        if(urlLower->Equals(hostRoot) || urlLower->Equals(hostRoot+"/")){
            Priority+=15;
            if(URL->Split('.')->Length<=4)Priority+=15;
        }
        String^ wwwRoot=("http://www."+getDomain(URL))->ToLower();
        if(urlLower->Equals(wwwRoot))Priority++;
        if(urlLower->Equals(wwwRoot+"/"))Priority++;

        // Skip pages that are already stored. OleDb parameters are positional ('?'); binding the
        // values keeps quotes in page text from breaking (or injecting into) the statement.
        OleDbCommand^ existsCmd=gcnew OleDbCommand("select count(*) from pages where [url]=?",conn);
        existsCmd->Parameters->AddWithValue("@URL",URL);
        if(Convert::ToInt32(existsCmd->ExecuteScalar())>0){
            return 1;
        }

        OleDbCommand^ cmd=gcnew OleDbCommand(
            "insert into Pages([URL],[Title],[PlainText],[LinkText],[Size],[Host],[UpdateTime],[OutDegree],[Priority]) values(?,?,?,?,?,?,?,?,?)",
            conn);
        // Parameters must be added in the same order as the '?' placeholders.
        cmd->Parameters->AddWithValue("@URL",URL);
        cmd->Parameters->AddWithValue("@Title",Title);
        // Page text can be long, so bind it explicitly as long text.
        cmd->Parameters->Add("@PlainText",OleDbType::LongVarWChar)->Value=PlainText;
        cmd->Parameters->Add("@LinkText",OleDbType::LongVarWChar)->Value=LinkText;
        cmd->Parameters->AddWithValue("@Size",Size);
        cmd->Parameters->AddWithValue("@Host",Host);
        // Sent as text, as the previous quoted literal was.
        // NOTE(review): assumes the column accepts this text form - confirm against the db.mdb schema.
        cmd->Parameters->AddWithValue("@UpdateTime",DateTime::Now.ToString());
        cmd->Parameters->AddWithValue("@OutDegree",OutDegree);
        cmd->Parameters->AddWithValue("@Priority",Priority);
        cmd->ExecuteNonQuery();
        return 0;
    }catch(Exception^ ex){
#ifdef DBG
        Console::WriteLine(ex->Message);
#endif
        return 1;
    }
}
// Splits a fully downloaded HTTP response into its parts and saves the page.
//
// lpParam points to the PER_IO_DATA of the finished download (buf holds `top` received bytes,
// url holds the page address). The function
//   1. locates the header/body separator and the declared character set,
//   2. decodes the body (falling back to the system default encoding),
//   3. queues every not-yet-visited in-site link whose URI segment count is within
//      Global::max_depth,
//   4. stores the page through addPage and prints its URL.
// Always returns 0; failures are swallowed (printed only when DBG is defined).
int WebCut(LPVOID lpParam){
    PPER_IO_DATA pPerIO=(PPER_IO_DATA)lpParam;
    char cdbuf[20]={0}; // name of the character set
    char key1[]="\r\n\r\n";
    char key2[]="charset=";
    char key3[]="encoding=";
    try{
        // All scans are limited to the bytes that were actually received.
        char* const bufEnd=pPerIO->buf+pPerIO->top;
        // Start of the body; the whole buffer when no separator exists.
        char* start=std::search(pPerIO->buf,bufEnd,key1,key1+strlen(key1));
        if(start==bufEnd)start=pPerIO->buf;
        char* ix=std::search(pPerIO->buf,bufEnd,key2,key2+strlen(key2));
        char* ix2=std::search(pPerIO->buf,bufEnd,key3,key3+strlen(key3));

        String^ cding;
        if(ix!=bufEnd){
            ix+=strlen(key2);
            int cdbuftop=0;
            // Leave room for the terminator and never read beyond the received data.
            while(ix<bufEnd && *ix!='"' && *ix!='\n' && *ix!='\r' && cdbuftop<(int)sizeof(cdbuf)-1){
                cdbuf[cdbuftop++]=*ix;
                ix++;
            }
            cdbuf[cdbuftop]='\0';
            cding=gcnew String(cdbuf);
        }
        else if(ix2!=bufEnd){
            ix2+=strlen(key3);
            int available=(int)(bufEnd-ix2);
            int probeLength=available<12?available:12;
            String^ str=gcnew String(ix2,0,probeLength,Encoding::Default);
            Regex^ reg=gcnew Regex("(?<CDING>(UTF-8)|(GB2312))",RegexOptions::IgnoreCase);
            // Empty when neither name matches; GetEncoding then throws and the fallback below is used.
            cding=reg->Match(str)->Groups["CDING"]->ToString();
        }
        else{
            cding="GB2312";
        }
        if(cding->ToUpper()=="UTF8")cding="UTF-8";

        // The body is exactly what lies between `start` and the end of the received data
        // (embedded NUL bytes included).
        int htmllen=(int)(bufEnd-start);
        String^ html;
        try{
            html=gcnew String(start,0,htmllen,Encoding::GetEncoding(cding)); // page text
        }catch(Exception^ ex){
#ifdef DBG
            Console::WriteLine(ex->Message);
#endif
            html=gcnew String(start,0,htmllen,Encoding::Default); // page text
        }

        HTMLDoc^ doc=gcnew HTMLDoc(html,gcnew String(pPerIO->url)); // page analyser
        ArrayList^ al=doc->GetInnerLinks(); // all in-site links
        for(int i=0;i<al->Count;i++){
            String^ link=(String^)al[i];
            String^ urlflag=link->ToLower();
            if(Global::g_visited.ContainsKey(urlflag))continue;
            if(urlflag->EndsWith("/")){
                // Treat "x/" as visited when "x" was visited.
                int lastSlash=urlflag->LastIndexOf("/");
                if(Global::g_visited.ContainsKey(urlflag->Substring(0,lastSlash)))continue;
            }
            try{
                if((gcnew Uri(link))->Segments->Length<=Global::max_depth){
                    UrlQueue::g_urlqueue->push(gcnew UrlAtom(link)); // put into the URL queue
                }
            }
            catch(Exception^){
                // A malformed link is skipped.
            }
        }

        String^ URL=gcnew String(pPerIO->url);
        String^ domain=getDomain(URL);
        int priority=0; // sites without a configured priority get 0
        if(Global::g_priorities.ContainsKey(domain))
            priority=Global::g_priorities[domain];
        addPage(URL,doc->GetTitle(),doc->GetPlainText(),doc->GetLinkText(),doc->GetSize(),(gcnew Uri(URL))->Host,al->Count,priority);
        Console::WriteLine(URL); // print the URL of the finished page
    }
    catch(Exception^ ex){
#ifdef DBG
        Console::WriteLine(ex->Message);
#endif
    }
    return 0;
}
// Decides whether two host names are related.
// Each host is searched for the first label enclosed by two dots (e.g. "example" in
// "www.example.com"); the hosts are related when both have such a label and the labels are
// equal ignoring case. Returns false when either host has no such label or on any error.
bool hostrelated(String^ a,String^ b){
    try{
        Regex^ middleLabel=gcnew Regex("\\.([^.]+)\\.");
        Match^ mc1=middleLabel->Match(a);
        Match^ mc2=middleLabel->Match(b);
        if(!mc1->Success || !mc2->Success)
            return false;
        if(mc1->Groups->Count<=1 || mc2->Groups->Count<=1)
            return false;
        return mc1->Groups[1]->Value->ToLower()->Equals(mc2->Groups[1]->Value->ToLower());
    }
    catch(Exception^ ex)
    {
#ifdef DBG
        // ex is a handle, so its members are reached with ->
        Console::WriteLine(ex->Message);
#endif
        return false;
    }
}
//ServerThread线程函数用于查询和操作完成端口
DWORD WINAPI ServerThread(LPVOID lpParam)
{
// 得到完成端口对象句柄
HANDLE hCompletion = (HANDLE)lpParam;
DWORD dwTrans;
PPER_HANDLE_DATA pPerHandle;
PPER_IO_DATA pPerIO;
while(g_iogo)
{
// 在关联到此完成端口的所有套节字上等待I/O完成
BOOL bOK = ::GetQueuedCompletionStatus(hCompletion,
&dwTrans, (LPDWORD)&pPerHandle, (LPOVERLAPPED*)&pPerIO, TIMEOUT_IOCP);
try{
if(!bOK) // 在此套节字上有错误发生
{
try{
if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET)
::closesocket(pPerHandle->s);
if(pPerHandle!=NULL)
::GlobalFree(pPerHandle);
if(pPerIO!=NULL)
::GlobalFree(pPerIO);
}catch(Exception^ ex){
#ifdef DBG
Console::WriteLine(ex->Message);
#endif
delete ex;
}
Global::postAmount--;
continue;
}
if(dwTrans == 0 && // 套节字被对方关闭(既网页下载完毕了)
(pPerIO->nOperationType == OP_READ || pPerIO->nOperationType == OP_WRITE))
{
Global::postAmount--;
if(String(pPerIO->buf,0,1024).IndexOf("200 OK")>-1){ //网页存在
WebCut((LPVOID)pPerIO); //处理网页
//printf(pPerIO->buf);
}
try{
if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET)
::closesocket(pPerHandle->s);
if(pPerHandle!=NULL)
::GlobalFree(pPerHandle);
if(pPerIO!=NULL)
::GlobalFree(pPerIO);
}catch(Exception^ ex){
#ifdef DBG
Console::WriteLine(ex->Message);
#endif
delete ex;
}
continue;
}
//Content-Type:
Regex reg("\\sContent-Type:\\s(?<TYPE>[^\\s]+)\\s");
Regex reg2("\\sLocation:\\s(?<LOCATION>[^\\s]+)\\s");
Match^ mc;
Match^ mc2;
DWORD nFlags=0;
switch(pPerIO->nOperationType) // 通过per-I/O数据中的nOperationType域查看什么I/O请求完成了
{
case OP_READ: // 完成一个接收请求
//pPerIO->buf[dwTrans]='\0';
if(pPerIO->top<=BUFSMALL && pPerIO!=NULL){
mc=reg.Match(% String(pPerIO->buf));
mc2=reg2.Match(% String(pPerIO->buf));
//防止下载非文本文件
if(!mc->Success || (mc->Groups[L"TYPE"]->ToString()->Length>=5 && mc->Groups[L"TYPE"]->ToString()->Substring(0,5)->ToLower()!="text/") || mc->Groups[L"TYPE"]->ToString()->Length<5){
try{
if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET)
::closesocket(pPerHandle->s);
if(pPerHandle!=NULL)
::GlobalFree(pPerHandle);
if(pPerIO!=NULL)
::GlobalFree(pPerIO);
}catch(Exception^ ex){
#ifdef DBG
Console::WriteLine(ex->Message);
#endif
delete ex;
}
continue;
}
if(mc2->Success){ //如果有重定向
String^ newurl=Uri(% Uri(% String(pPerIO->url)),mc2->Groups[L"LOCATION"]->ToString()).ToString();
String^ url=gcnew String(pPerIO->url);
try{
if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET)
::closesocket(pPerHandle->s);
if(pPerHandle!=NULL)
::GlobalFree(pPerHandle);
if(pPerIO!=NULL)
::GlobalFree(pPerIO);
}catch(Exception^ ex){
#ifdef DBG
Console::WriteLine(ex->Message);
#endif
delete ex;
}
Uri^ newuri=gcnew Uri(newurl);
if(hostrelated(newuri->Host,Uri(url).Host))
{
if(Global::g_priorities.ContainsKey(getDomain(url)))
Global::g_priorities[getDomain(newurl)]=Global::g_priorities[getDomain(url)];
PostGet(newurl);
continue;
}
}
}
WSABUF buf;
//printf("%d\n",dwTrans);
pPerIO->top+=dwTrans;
if(pPerIO->top>(BUFBIG-1)){//防止网页过大而OverFlow
try{
if(pPerHandle!=NULL && pPerHandle->s!=INVALID_SOCKET)
::closesocket(pPerHandle->s);
if(pPerHandle!=NULL)
::GlobalFree(pPerHandle);
if(pPerIO!=NULL)
::GlobalFree(pPerIO);
}catch(Exception^ ex){
#ifdef DBG
Console::WriteLine(ex->Message);
#endif
delete ex;
}
continue;
}
//pPerIO->buf[pPerIO->top]='\0';
buf.buf=pPerIO->buf+pPerIO->top;
buf.len=BUFSMALL ;
nFlags=0;
::WSARecv(pPerHandle->s, &buf, 1, &dwTrans, &nFlags, &pPerIO->ol, NULL);
break;
case OP_WRITE: // 完成一个发送请求,此处暂时无用
//Sleep(10);
break;
}
}
catch(System::AccessViolationException^ ex){
#ifdef DBG
Console::WriteLine(ex->Message);
#endif
delete ex;
Global::postAmount--;
//System::GC::Collect();
//System::GC::WaitForPendingFinalizers();
}
}
return 0;
}
// Program entry point. Works in two modes:
//
//  * no arguments (launcher): empties the Pages table, reads the site list from the Sites
//    table ordered by priority, deals the URLs round-robin into PRONUMBER groups and starts
//    one WebSpiderEH.exe child process per non-empty group with the URLs as arguments;
//  * with arguments (worker): each argument is a start URL. The worker creates the
//    completion port and its service thread, loads the priority of every start URL, then
//    keeps popping URLs from UrlQueue and posting them with PostGet, throttled by
//    Global::throughput, until the queue has stayed empty for TIMEOUT_IDLE seconds.
//
// Returns 0 normally, 1 when the completion port or its thread cannot be created.
int main(array<System::String ^> ^args)
{
    String^ appPath=AppDomain::CurrentDomain::get()->BaseDirectory;
    String^ connstr="provider=microsoft.jet.oledb.4.0;data source=" +appPath+ "\\db.mdb";
    if(args->Length==0){
        OleDbConnection^ conn=gcnew OleDbConnection(connstr);
        //Console::WriteLine("Author:Sunjoy@ICT");
        //Console::WriteLine("E-mail:ccnusjy@gmail.com");
        Console::WriteLine("正在导入要搜索的站点...");
        conn->Open();
        // Delete the pages left over from the previous crawl (best effort).
        OleDbCommand^ initcmd=gcnew OleDbCommand("delete from Pages",conn);
        try{
            initcmd->ExecuteNonQuery();
        }catch(Exception^ ex){
#ifdef DBG
            Console::WriteLine(ex->Message);
#endif
        }
        OleDbCommand^ cmd=gcnew OleDbCommand("select * from sites order by Priority desc",conn);
        OleDbDataReader^ rd=cmd->ExecuteReader();
        // params[k] collects the space separated URLs handed to child process k.
        array<String^> ^params=gcnew array<String^>(PRONUMBER);
        int ct=0;
        while(rd->Read()){
            String^ site=rd["URL"]->ToString()->Trim();
            int slot=ct%PRONUMBER;
            if(params[slot]==nullptr)
                params[slot]=site;
            else
                params[slot]=params[slot]+" "+site;
            ct++;
        }
        for(int j=0;j<PRONUMBER;j++){
            if(params[j]!=nullptr){
                Process^ p=gcnew Process();
                p->StartInfo->FileName=appPath+"\\WebSpiderEH.exe";
                p->StartInfo->Arguments=params[j];
                Console::WriteLine("{----"+params[j]+"---}");
                p->StartInfo->UseShellExecute=false;
                p->Start();
            }
        }
        rd->Close();
        delete rd;
        conn->Close();
        delete conn;
    }
    else{
        int idletimes=0;
        hCompletion = ::CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0); // create the completion port
        if(hCompletion==NULL){
            Console::WriteLine("创建完成端口失败");
            return 1;
        }
        g_iogo=true;
        // Start the thread that services the completion port.
        HANDLE thd=::CreateThread(NULL, 0, ServerThread, (LPVOID)hCompletion, 0, 0);
        if(thd==NULL){
            // Without the service thread no download would ever be processed.
            Console::WriteLine("创建完成端口线程失败");
            return 1;
        }
        ::CloseHandle(thd); // drop our reference; the thread keeps running

        // Load the priority of every start URL. One connection serves all lookups and is
        // always closed; the URL is bound as a parameter rather than pasted into the SQL text.
        OleDbConnection^ conn=gcnew OleDbConnection(connstr);
        conn->Open();
        try{
            for(int i=0;i<args->Length;i++){
                OleDbCommand^ cmd=gcnew OleDbCommand("select Priority from Sites where URL=?",conn);
                cmd->Parameters->AddWithValue("@URL",args[i]);
                Object^ priorityValue=cmd->ExecuteScalar(); // nullptr when the URL is not in Sites
                if(priorityValue!=nullptr){
                    String^ tmpdomain="";
                    try{
                        tmpdomain=getDomain(args[i]);
                    }
                    catch(Exception^){
                        Console::WriteLine("不规范或不完整的URL{0},请您检查",args[i]);
                        continue;
                    }
                    Global::g_priorities[tmpdomain]=int::Parse(priorityValue->ToString());
                }
                UrlQueue::g_urlqueue->push(gcnew UrlAtom(args[i]));
            }
        }
        finally{
            conn->Close();
        }

        Regex^ nonLatin=gcnew Regex("(?<h>[^\\x00-\\xff]+)");
        bool queueExhausted=false;
        while(!queueExhausted)
        {
            String^ url=UrlQueue::g_urlqueue->pop()->url;
            while(String::IsNullOrEmpty(url)){
                if(idletimes==TIMEOUT_IDLE){
                    queueExhausted=true;
                    break;
                }
                Sleep(1000);
                idletimes++;
                url=UrlQueue::g_urlqueue->pop()->url;
            }
            if(queueExhausted)
                break;
            // Percent-encode every run of characters above U+00FF as GB2312. The runs are
            // found in the original text while the replacements accumulate in encodedUrl.
            String^ encodedUrl=url;
            for(Match^ mc=nonLatin->Match(url);mc->Success;mc=mc->NextMatch())
            {
                String^ han=mc->Groups["h"]->Value;
                encodedUrl=encodedUrl->Replace(han, System::Web::HttpUtility::UrlEncode(han, Encoding::GetEncoding("GB2312")));
            }
            url=encodedUrl;
            try{
                gcnew Uri(url); // validation only
            }
            catch(Exception^){
                continue; // skip malformed URLs
            }
            DateTime t1=DateTime::Now;
            PostGet(url);
            TimeSpan delta=DateTime::Now-t1;
            int maxT=10;
            // Space successive posts at least maxT milliseconds apart.
            if(delta.TotalMilliseconds<maxT){
                Sleep(maxT-(int)delta.TotalMilliseconds);
            }
            // Wait while too many requests are outstanding. After a minute of waiting the
            // counter is halved on every pass so that the crawl cannot stall for good.
            DateTime b1=DateTime::Now;
            while(Global::postAmount>Global::throughput){
                Sleep(1000);
                TimeSpan dlt=DateTime::Now-b1;
                if(dlt.TotalMinutes>1)
                    Global::postAmount/=2;
            }
            idletimes=0;
        }//endwhile
        g_iogo=false; // tell the completion port thread to stop looping
        Console::WriteLine("====OVER====");
    }
    return 0;
}
|
|
| 返回顶楼 | |
|
时间:2007-11-11
没人感兴趣吗?
看来C++/CLI用的人还是不多,我觉得挺不错的,又可以调Win32 API,又可以用.Net的库。 |
|
| 返回顶楼 | |
|
时间:2007-11-15
我们用VC6过十年了,
但宁愿用C#, 也不想用四不像的C++/CLI |
|
| 返回顶楼 | |
|
时间:2007-11-15
rtdb 写道 我们用VC6过十年了,
你们不愿意用也没人强迫你,我们年轻人愿意就行了,毕竟新的东西还是值得去尝试的。
但宁愿用C#, 也不想用四不像的C++/CLI |
|
| 返回顶楼 | |
|
时间:2007-11-22
fxsjy 写道 rtdb 写道 我们用VC6过十年了,
你们不愿意用也没人强迫你,我们年轻人愿意就行了,毕竟新的东西还是值得去尝试的。但宁愿用C#, 也不想用四不像的C++/CLI 重点是“四不像”,不利于工程化。 C#是全新的,我们不是也转了么。 |
|
| 返回顶楼 | |




