天网搜索TSE部分源码分析-url.cpp
//根据一个给定的URL,组成消息体,发送给该URL指向的服务器。为此,定义Url类(url.cpp)。
//根据一个给定的URL,组成消息体,发送给该URL指向的服务器。
#include<iostream>
#include <string>
#include <sys/socket.h>
#include <netdb.h>
#include "Tse.h"
#include "Url.h"
#include "Http.h"
#include "Md5.h"
#include "StrFun.h"
// Tests on a NUL-terminated C string (typically one path segment of a URL):
//   DOTP(x)  is true when x is exactly ".".
//   DDOTP(x) is true when x is exactly "..".
// The argument is parenthesised at every use so that a compound expression
// such as `cond ? a : b` is evaluated as a whole before the offset is added.
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))
#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))
// Host name -> dotted IP string cache, filled by CUrl::GetIpByHost.
map<string, string> mapCacheHostLookup;
// Defined in another translation unit; not used in the visible part of this file.
extern vector<string> vsUnreachHost;
// Taken by CUrl::GetIpByHost around insertions into mapCacheHostLookup.
pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
// Digest strings of URLs; looked up by CUrl::IsVisitedUrl.
extern set<string> setVisitedUrlMD5;
// (key, value) pairs scanned by CUrl::IsValidIp to decide whether an address is allowed.
extern map<unsigned long, unsigned long> mapIpBlock;
// Element type of mapCacheHostLookup, used when inserting a new pair.
typedef map<string, string>::value_type valTypeCHL;
// Describes one URL scheme known to the parser.
struct scheme_data
{
    // Scheme prefix such as "http://". Declared const because it is
    // initialised from string literals, which must not be modified.
    const char *leading_string;
    int default_port;   // default port for the scheme
    int enabled;        // non-zero when the scheme is allowed
};
//所有连接情况的定义
static struct scheme_data supported_schemes[] =
{
{ "http://",DEFAULT_HTTP_PORT, 1 },
{ "ftp://",DEFAULT_FTP_PORT, 1},
{NULL,-1,0 }
};
//分析并填充连接类型
void CUrl::ParseScheme (const char *url)
{
int i;
for (i = 0;supported_schemes[i].leading_string; i++)
if (0 ==strncasecmp (url, supported_schemes[i].leading_string,
strlen (supported_schemes[i].leading_string))) {
//判断连接类型,并更新m_eScheme成员
if (supported_schemes[i].enabled){
this->m_eScheme= (enum url_scheme) i;
return;
}else{
this->m_eScheme= SCHEME_INVALID;
return;
}
}
this->m_eScheme= SCHEME_INVALID;
return;
}
// Parses `strUrl` and stores the pieces in this object.
// Returns false (leaving m_sUrl/m_sHost/m_sPath/m_nPort untouched) when the
// scheme detected by ParseScheme is not SCHEME_HTTP; otherwise fills m_sUrl,
// m_sHost and m_sPath and returns true. m_nPort is only overwritten when the
// parsed port is positive; otherwise it keeps its previous value.
bool CUrl::ParseUrlEx(string strUrl)
{
    // Classify the scheme first; only HTTP URLs are processed further.
    this->ParseScheme(strUrl.c_str());
    if (this->m_eScheme != SCHEME_HTTP) {
        return false;
    }

    // Fixed-size scratch buffers, zero-filled before use.
    char protocol[10];
    char host[HOST_LEN];
    char request[256];
    int port = -1;
    memset(protocol, 0, sizeof(protocol));
    memset(host, 0, sizeof(host));
    memset(request, 0, sizeof(request));

    // Each buffer is passed together with its capacity.
    ParseUrlEx(strUrl.c_str(),
               protocol, sizeof(protocol),
               host, sizeof(host),
               request, sizeof(request),
               &port);

    // The buffers are read as C strings below. Force termination so that an
    // over-long component can never make the std::string assignments read
    // past the end of a buffer (the component is truncated instead).
    protocol[sizeof(protocol) - 1] = '\0';
    host[sizeof(host) - 1] = '\0';
    request[sizeof(request) - 1] = '\0';

    m_sUrl = strUrl;
    m_sHost = host;
    m_sPath = request;
    if (port > 0) {
        m_nPort = port;
    }
    return true;
}
// Copies at most cap-1 characters of `src` into `dst` and always
// NUL-terminates `dst`. Does nothing when cap is not positive.
static void CopyBounded(char *dst, const char *src, int cap)
{
    if (cap <= 0) {
        return;
    }
    strncpy(dst, src, cap - 1);
    dst[cap - 1] = '\0';
}

// Splits `url` into protocol, host, request (everything after the host) and
// port.
//   protocol/lprotocol, host/lhost, request/lrequest: output buffers and
//       their capacities in bytes; each result is truncated to fit and is
//       always NUL-terminated.
//   port: receives the number after ':' in the host part, or 80 when there
//       is none.
// NOTE(review): the first ':' in `url` is taken as the end of the protocol,
// so a scheme-less URL that carries a port ("host:8080/x") is mis-split. The
// caller visible in this file only passes URLs already recognised as http://.
void CUrl::ParseUrlEx(const char *url,
        char *protocol, int lprotocol,
        char *host, int lhost,
        char *request, int lrequest,
        int *port)
{
    *protocol = *host = *request = 0;
    *port = 80;

    // Work on a private, writable copy so separators can be overwritten.
    const size_t len = strlen(url);
    char *work = new char[len + 1];
    memcpy(work, url, len + 1);

    // Protocol: text before the first ':'; "HTTP" when there is no ':'.
    char *ptr = strchr(work, ':');
    if (ptr != NULL) {
        *(ptr++) = 0;   // terminate the protocol inside the work buffer
        CopyBounded(protocol, work, lprotocol);
    } else {
        CopyBounded(protocol, "HTTP", lprotocol);
        ptr = work;
    }

    // Skip the "//" that follows the protocol, if present.
    if ((*ptr == '/') && (*(ptr + 1) == '/')) {
        ptr += 2;
    }

    // Host: the longest run of valid host characters (this includes ':' and
    // the port digits, which are split off below).
    char *ptr2 = ptr;
    while (*ptr2 && IsValidHostChar(*ptr2)) {
        ptr2++;
    }

    // Request: taken from the untouched `url` at the same offset, because
    // the work copy is about to be cut at that position.
    const char *pStr = url + (ptr2 - work);
    *ptr2 = 0;
    CopyBounded(host, ptr, lhost);
    CopyBounded(request, pStr, lrequest);

    // Port: split "host:port" in the output buffer.
    ptr = strchr(host, ':');
    if (ptr != NULL) {
        *ptr = 0;
        *port = atoi(ptr + 1);
    }

    delete [] work;
}
// Default constructor: empty URL, host and path, invalid scheme, and the
// default HTTP port.
CUrl::CUrl()
{
    // text members
    m_sUrl = "";
    m_sHost = "";
    m_sPath = "";
    // scalar members
    m_eScheme = SCHEME_INVALID;
    m_nPort = DEFAULT_HTTP_PORT;
}
// Destructor with an empty body: no explicit cleanup is performed here.
CUrl::~CUrl() {}
// Returns a new[]-allocated copy of the NUL-terminated string `text`.
static char *DupIpString(const char *text)
{
    const size_t len = strlen(text);
    char *copy = new char[len + 1];
    memcpy(copy, text, len + 1);
    return copy;
}

// Resolves `host` to a dotted-decimal IPv4 string.
// Lookup order: (1) `host` already is a dotted address, (2) the
// mapCacheHostLookup cache, (3) a DNS query whose result is then cached.
// Returns NULL when `host` is NULL, fails IsValidHost, or cannot be resolved.
// Otherwise the result is allocated with new[] inside this function, so the
// caller has to release it with delete[].
char * CUrl::GetIpByHost(const char *host)
{
    if (!host) {                // null pointer
        return NULL;
    }
    if (!IsValidHost(host)) {   // invalid host
        return NULL;
    }

    // (1) The host is already a dotted IP address: hand back a copy.
    if (inet_addr(host) != INADDR_NONE) {
        return DupIpString(host);
    }

    // (2) Look in the cache. The entry is copied out while holding the same
    // mutex that guards insertions, so a concurrent insert cannot race with
    // this read.
    bool cached = false;
    string cachedIp;
    pthread_mutex_lock(&mutexCacheHost);
    map<string, string>::iterator it = mapCacheHostLookup.find(host);
    if (it != mapCacheHostLookup.end()) {
        cached = true;
        cachedIp = it->second;
    }
    pthread_mutex_unlock(&mutexCacheHost);
    if (cached && inet_addr(cachedIp.c_str()) != INADDR_NONE) {
        return DupIpString(cachedIp.c_str());
    }

    // (3) Ask the resolver. getaddrinfo is used instead of gethostbyname
    // because the latter returns a pointer to static storage and is therefore
    // unsafe when several threads resolve hosts at once. Only IPv4 results
    // are requested, matching the in_addr/AF_INET handling below.
    struct addrinfo hints;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    struct addrinfo *res = NULL;
    if (getaddrinfo(host, NULL, &hints, &res) != 0 || res == NULL) {
        return NULL;
    }
    // Use the first address returned.
    struct in_addr in = reinterpret_cast<struct sockaddr_in *>(res->ai_addr)->sin_addr;
    freeaddrinfo(res);

    // Convert the binary address to dotted-decimal text.
    char abuf[INET_ADDRSTRLEN];
    if (inet_ntop(AF_INET, (void *)&in, abuf, sizeof(abuf)) == NULL) {
        cout<< "inet_ntop() return error inGetIpByHost" << endl;
        return NULL;
    }

    // Remember the result; an entry that already exists is left unchanged.
    pthread_mutex_lock(&mutexCacheHost);
    if (mapCacheHostLookup.find(host) == mapCacheHostLookup.end()) {
        mapCacheHostLookup.insert(map<string, string>::value_type(host, abuf));
    }
    pthread_mutex_unlock(&mutexCacheHost);

    return DupIpString(abuf);
}
// Returns true when `ch` may appear in the host part of a URL as parsed
// here: letters, digits, '-', '.', ':' and '_'.
bool CUrl::IsValidHostChar(char ch)
{
    // The <cctype> classifiers require a value representable as unsigned
    // char (or EOF); passing a negative char is undefined behaviour.
    const unsigned char uch = static_cast<unsigned char>(ch);
    return( isalpha(uch) || isdigit(uch)
        || ch=='-' || ch=='.' || ch==':' || ch=='_');
}
// Returns true when `host` is non-NULL, at least 6 characters long and made
// up only of characters accepted by IsValidHostChar.
bool CUrl::IsValidHost(const char *host)
{
    if (!host) {
        return false;
    }
    if (strlen(host) < 6) {     // rejects short names such as "www", "pku"
        return false;
    }
    // Walk every character up to the terminating NUL. The loop must not be
    // bounded by strlen() of a pointer that is advanced at the same time:
    // that stops about halfway and leaves the tail of the name unchecked.
    for (const char *p = host; *p != '\0'; ++p) {
        if (!IsValidHostChar(*p)) {
            return false;
        }
    }
    return true;
}
bool CUrl::IsVisitedUrl(const char *url)
{
if( !url ){
return true; // if be null, wethink it have been visited
}
CMD5 iMD5;
iMD5.GenerateMD5( (unsigned char*)url,strlen(url) );
string strDigest = iMD5.ToString();
if( setVisitedUrlMD5.find(strDigest) !=setVisitedUrlMD5.end() ) {
return true;
} else {
return false;
}
}
// Returns true when `ip` is a parsable dotted IPv4 address that falls inside
// one of the blocks in mapIpBlock, or when mapIpBlock is empty (no
// restriction configured). Returns false for NULL or unparsable input.
bool CUrl::IsValidIp(const char *ip)
{
    if (ip == NULL) {
        return false;
    }
    const unsigned long inaddr = (unsigned long)inet_addr(ip);
    if (inaddr == INADDR_NONE) {    // invalid ip
        return false;
    }

    // No block list given: every valid address counts as inside.
    if (mapIpBlock.empty()) {
        return true;
    }

    // An entry matches when the address with the bits of `second` cleared
    // equals `first`.
    // NOTE(review): presumably `first` is a network address and `second` the
    // inverted netmask — confirm against the code that fills mapIpBlock.
    map<unsigned long, unsigned long>::iterator pos;
    for (pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos) {
        const unsigned long ret = inaddr & ~(pos->second);
        if (ret == pos->first) {    // inside
            return true;
        }
    }
    // outside every block
    return false;
}
// Returns false when `host` is a dotted IP address or its last dot-separated
// label (compared in lower case) is one of the accepted suffixes listed
// below; returns true otherwise, including for an empty host, a host longer
// than HOST_LEN, or a host without any '.'.
bool CUrl::IsForeignHost(string host)
{
    if (host.empty()) return true;
    if (host.size() > HOST_LEN) return true;

    // A literal IP address is never treated as foreign.
    if (inet_addr(host.c_str()) != INADDR_NONE) {   // host is just ip
        return false;
    }

    // Text after the last '.', lower-cased; stays empty if there is no '.'.
    string tmp;
    const string::size_type idx = host.rfind('.');
    if (idx != string::npos) {
        tmp = host.substr(idx + 1);
    }
    CStrFun::Str2Lower( tmp, tmp.size() );

    const char *const home_host[] = {
        "cn", "com", "net", "org", "info",
        "biz", "tv", "cc", "hk", "tw"
    };
    // Derived from the array so the count cannot drift from the list.
    const unsigned int home_host_num = sizeof(home_host) / sizeof(home_host[0]);
    for (unsigned int i = 0; i < home_host_num; i++) {
        if (tmp == home_host[i]) {
            return false;
        }
    }
    return true;
}
// Returns true when the text after the last '.' in `url` (compared in lower
// case) is one of the image extensions listed below. Returns false for an
// empty URL, a URL longer than HOST_LEN, or a URL without any '.'.
bool CUrl::IsImageUrl(string url)
{
    if (url.empty()) return false;
    if (url.size() > HOST_LEN) return false;

    // Text after the last '.', lower-cased; stays empty if there is no '.'.
    string tmp;
    const string::size_type idx = url.rfind('.');
    if (idx != string::npos) {
        tmp = url.substr(idx + 1);
    }
    CStrFun::Str2Lower( tmp, tmp.size() );

    const char *const image_type[] = {
        "gif", "jpg", "jpeg", "png", "bmp",
        "tif", "psd"
    };
    // Derived from the array so the count cannot drift from the list.
    const unsigned int image_type_num = sizeof(image_type) / sizeof(image_type[0]);
    for (unsigned int i = 0; i < image_type_num; i++) {
        if (tmp == image_type[i]) {
            return true;
        }
    }
    return false;
}
更多推荐
所有评论(0)