当前位置: 首页 > 图文教程 > 网络编程 > ASP > 自己做采集程序

ASP
ASP基础讲座(下)
解决IIS5 HTTP500内部错误
ASP 3.0高级编程(四十六)
ASP 3.0高级编程(四十五)
ASP 3.0高级编程(四十四)
ASP 3.0高级编程(四十三)
ASP 3.0高级编程(四十二)
ASP 3.0高级编程(四十一)
ASP 3.0高级编程(三十九)
ASP 3.0高级编程(三十八)
ASP 3.0高级编程(三十七)
ASP 3.0高级编程(三十六)
ASP 3.0高级编程(三十五)
ASP 3.0高级编程(三十四)
ASP 3.0高级编程(三十三)
ASP 3.0高级编程(三十二)
ASP 3.0高级编程(三十一)
ASP错误代码说明
jscript错误代码及相应解释大全
ASP错误处理

ASP 中的 自己做采集程序


出处:互联网   整理: 软晨网(RuanChen.com)   发布: 2009-09-13   浏览: 56 ::
收藏到网摘: n/a

现在网上的采集程序很多,但是有时候你发现一个好的网站,想自己做个采集工具采集一些信息,就需要自己去写程序了,其实这样的采集程序并不难写,主要是去分析源网站的网页结构。
首先去下载个XMLHTTP的类文件:
<%
Class xhttp
private cset,sUrl,sError
Private Sub Class_Initialize()
'cset="UTF-8"
cset="GB2312"
sError=""
end sub

Private Sub Class_Terminate()
End Sub

Public Property LET URL(theurl)
sUrl=theurl
end property
public property GET BasePath()
BasePath=mid(sUrl,1,InStrRev(sUrl,"/")-1)
end property
public property GET FileName()
FileName=mid(sUrl,InStrRev(sUrl,"/")+1)
end property
public property GET Html()
Html=BytesToBstr(getBody(sUrl))
end property

public property GET xhttpError()
xhttpError=sError
end property

private Function BytesToBstr(body)
on error resume next
'Cset:GB2312 UTF-8
dim objstream
set objstream = Server.CreateObject("adodb.stream")
with objstream
.Type = 1 '
.Mode = 3 '
.Open
.Write body '
.Position = 0 '
.Type = 2 '
.Charset = Cset '
BytesToBstr = .ReadText '
.Close
end with
set objstream = nothing
End Function

private function getBody(surl)
on error resume next
dim xmlHttp
'Set xmlHttp=server.createobject("Msxml2.XMLHTTP.4.0")
'set xmlHttp=server.createobject("Microsoft.XMLHTTP")
set xmlHttp=server.createobject("MSXML2.ServerXMLHTTP")
xmlHttp.setTimeouts 10000,10000,10000,30000
xmlHttp.open "GET",surl,false
xmlHttp.send
if xmlHttp.readystate=4 then
'if xmlHttp.status=200 then
getBody=xmlhttp.responsebody
'end if
else
getBody=""
end if

if Err.Number<>0 then
sError=Err.Number
Err.clear
else
sError=""
end if
set xmlHttp=nothing
end function

Public function saveimage(tofile,isoverwrite)
on error resume next
dim objStream,objFSO,imgs

if Not isoverwrite Then
Set objFSO = Server.CreateObject("Scripting.FileSystemObject")
If objFSO.FileExists(Server.MapPath(tofile)) Then
Exit Function
End If
Set objFSO = Nothing
End IF

imgs=getBody(sUrl)
Set objStream = Server.CreateObject("ADODB.Stream")
with objStream
.Type =1
.Open
.write imgs
.SaveToFile server.mappath(tofile),2
.Close()
end with
set objstream=nothing
end function

end class

%>
用了这个类文件,做起事情来就方便多了。
然后就可以分析采集网站的网页结构,写采集程序了。
下面给个例子:
<!--#include file="conn.asp"-->
<!--#include file="inc/xhttp_class.asp"-->
<!--#include file="inc/function.asp"-->
<%
server.ScriptTimeout = 1000
%>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>BT采集器</title>
</head>
<body>
<form name="form1" method="post" action="get81bt.asp">
分类ID:
<input type="text" name="cid" value="<%=request("cid")%>"><br>
开始ID:
<input type="text" name="startid" value="<%=request("startid")%>">
<br>
结束ID:
<input type="text" name="overid" value="<%=request("overid")%>">
<br>
分类名称:<input type="text" name="classname" value="<%=request("classname")%>">为空自动获取
<br>
<input name="action" type="hidden" id="action" value="getdata">
<input type="submit" name="Submit" value="采集">
</form>
当前ID:<%=request("id")%> <br>
<%
dim action
action = Request("action")
if action = "getdata" then
cid = Request("cid")
startid = Request("startid")
overid = Request("overid")
id = Request("id")
if id = "" then id = startid
set objxhttp = new xhttp
objxhttp.URL = "http://www.81dd.com/Class/"&cid&"_"&id&".htm"
content = objxhttp.Html
if InStr(content,"网站维护中") then
call NextID
response.End()
end if
list = GetContent(content,"<!--内容开始-->","<!--内容结束-->",0)
Dim regEx, Match, Matches,patrn
Set regEx = New RegExp
patrn = "<a href=""../BtHtml/(.+?)"">"
regEx.Pattern = patrn
regEx.IgnoreCase = True
regEx.Global = True
Set Matches = regEx.Execute(list)
on error resume next
For Each Match in Matches
'response.write Match.Value & "<br>"
weburl = "http://www.81dd.com/BtHtml/" & regEx.Replace(Match.Value,"$1")
response.write weburl & "<br>"
response.Flush()
objxhttp.URL = weburl
cpage = objxhttp.Html
cpage = GetContent(cpage,"<!--内容开始-->","<!--内容结束-->",0)
title = GetContent(cpage,"BT资源名称:<strong>","</strong>",0)
title = stripHTML(title)
IF Request("classname") <> "" then
classname = Request("classname")
Else
if InStr(title,"喜剧") then
classname = "喜剧"
Elseif InStr(title,"动作") then
classname = "动作"
Elseif InStr(title,"惊悚") then
classname = "惊悚"
Elseif InStr(title,"犯罪") then
classname = "犯罪"
Elseif InStr(title,"恐怖") then
classname = "恐怖"
Elseif InStr(title,"爱情") then
classname = "爱情"
Elseif InStr(title,"冒险") then
classname = "冒险"
Elseif InStr(title,"科幻") then
classname = "科幻"
Elseif InStr(title,"悬念") then
classname = "悬念"
Elseif InStr(title,"奇幻") then
classname = "奇幻"
Elseif InStr(title,"战争") then
classname = "战争"
Elseif InStr(title,"连续剧") then
classname = "连续剧"
Elseif InStr(title,"综艺") then
classname = "综艺"
Elseif InStr(title,"灾难") then
classname = "灾难"
Elseif InStr(title,"伦理") then
classname = "伦理"
Elseif InStr(title,"动漫") or InStr(title,"动画") then
classname = "动漫"
Elseif InStr(title,"国语") or InStr(title,"集") then
classname = "其他影视"
Else
classname = "其他"
End if
End IF
intro = GetContent(cpage,"<tr><td width=770 bgcolor=#FFFFFF><div style=""margin:10px;line-height:150%"">","</div>",0)
intro = Replace(intro,"<br />","[br]")
intro = Replace(intro,"<BR />","[br]")
intro = Replace(intro,"<BR>","[br]")
intro = Replace(intro,"<br>","[br]")
intro = Replace(intro,"<p>","[p]")
intro = Replace(intro,"<P>","[p]")
intro = Replace(intro,"</p>","[/p]")
intro = Replace(intro,"</P>","[p]")
intro = Replace(intro,"<img","[img")
intro = Replace(intro,"<IMG","[img")
intro = stripHTML(intro)
intro = Replace(intro,"[br]","<br>")
intro = Replace(intro,"[p]","<p>")
intro = Replace(intro,"[/p]","</p>")
intro = Replace(intro,"[img","<img")
intro = Replace(intro,"[img]","<img src=")
intro = Replace(intro,"[/img]",">")
intro = Replace(intro,"[IMG]","<img src=")
intro = Replace(intro,"[/IMG]",">")
'response.write t
'response.End()
addtime = Trim(GetContent(cpage,"发布时间:"," ",0))
if Not IsDate(addtime) then addtime = now()
username = "bt"
filesize = GetContent(content,"BT文件大小:"," ",0)
title2 = title
downurl = GetContent(cpage,"<a style=""color:red"" href=""","""",0)
p = CDate(addtime)
Dim sRnd
Randomize
sRnd = Int(900 * Rnd) + 100
sFileName = year(p) & month(p) & day(p) & hour(now) & minute(now) & second(now) & sRnd & ".torrent"
url = "torrent/" & year(p) & "-" & month(p) & "-" & day(p) & "/" & sFileName
Call CreateF(url)
'Text
Response.Write classname & "<br>"
Response.write title & "<br>"
'response.Write intro & "<br>"
'response.Write addtime & "<br>"
'response.Write username & "<br>"
'response.Write filesize & "<br>"
response.Write downurl & "<br>"
response.Write url & "<br>"
response.Flush()
'response.End()
'database
if err.number = 0 then
if (Not IsNull(title)) and title <> "" and downurl <> "" then
set rs = server.CreateObject("adodb.recordset")
sql = "select * from bt_class where classname = '" & classname & "'"
rs.open sql,conn,1,3
if rs.eof then
rs.addnew
rs("classname") = classname
rs.update
end if
classid = rs("classid")
rs.close
set rs = nothing
set rs = server.CreateObject("adodb.recordset")
sql = "select * from bt_movie where title in ('" & title & "')"
rs.open sql,conn,1,3
if rs.eof then
response.Write "<div><font color=blue>写入数据库...</font></div>"
response.Flush()
rs.addnew
rs("classid") = classid
rs("title") = title
rs("title2") = title2
rs("intro") = intro
rs("username") = username
rs("filesize") = filesize
rs("url") = url
rs("serverid") = 1
rs("addtime") = addtime
rs("ismake") = 0
rs.update
objxhttp.URL = downurl
objxhttp.saveimage url,False
else
response.Write "<div><font color=red>已经存在!</font></div>"
end if
rs.close
set rs = nothing
'objxhttp.URL = downurl
'objxhttp.saveimage url,False
End IF
Else
err.clear
End IF
response.Write "-------------------------------------------<br>"
Next
set regEx = nothing

response.Write "下一页<br>"
response.Flush()
Call NextID()
end if
Sub NextID
conn.close
set conn = nothing
if cint(startid) < cint(overid) and cint(id) < cint(overid) then
response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id + 1 &"'</script>"
Elseif cint(startid) > cint(overid) and cint(id) > cint(overid) then
response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id - 1 &"'</script>"
Else
Response.Write "采集完成!<br>"
response.End()
End if
End Sub
%>
</body>
</html>