共采集了1501条记录,先放着,以后可能有用
源码如下:
'768.cc网址大全爬行脚本
'For WebSpider2
'梦游的猫
'创建时间 2006年02月23日 23:27:37
'最后修改 2006年02月24日 01:23:39
'Option Explicit
' Open the script-level ADO connection to the "WebSpider" database on the
' local SQL Server. saveDate uses this connection for every record it writes.
Set conn = CreateObject("ADODB.Connection")
' Fixed keyword: the OLE DB property is "Persist Security Info"
' (the original text read "Persist Security InFso").
MyConn = "Provider=SQLOLEDB.1;Persist Security Info=true;Data Source=127.0.0.1;Initial Catalog='WebSpider';User ID='spider';Password='spider';CONNect Timeout=30"
conn.Open MyConn
' Entry point: walks the numbered category pages under http://768.cc/DH/page/
' (page N corresponds to the N-th name in the category list) and records every
' link that points outside 768.cc.
'
' Each page is scanned table by table with a two-state flag:
'   getType = True  -> looking for a sub-category heading cell
'                      (<STRONG><FONT color=#996600>...</FONT></STRONG>)
'   getType = False -> a heading was found; looking for the link table that
'                      follows it (width 100%, centered, white, border 0)
' Every qualifying link is logged through writeTMPXML and stored via saveDate.
Sub Main
    ' Markers as they appear in innerHTML. The original text had the STRONG
    ' tags replaced by "**", which could never match the page markup.
    Const HEADING_OPEN = "<STRONG><FONT color=#996600>"
    Const HEADING_CLOSE = "</FONT></STRONG>"

    ' NOTE(review): Document is deliberately left undeclared, as in the
    ' original; it is handed to Spider.OpenURL and may be host-provided.
    Dim urlMain, urlType, urlTypeArr, urlType2, getType
    Dim i, it, itTD, itA

    urlMain = "http://768.cc/DH/page/"
    urlType = "党政|新闻|文教|贸易|广告|科技|交通|建筑|工业|日用品|旅游|副食|娱乐|医疗|论坛|其它"
    urlTypeArr = Split(urlType, "|")
    getType = True

    ' One page per category name (16 names -> pages 1..16).
    For i = 1 To UBound(urlTypeArr) + 1
        Call Spider.OpenURL(Document, urlMain & i & ".htm")
        For Each it In Document.getElementsByTagName("table")
            If getType Then
                ' Look for a short cell that holds only the heading markup.
                For Each itTD In it.getElementsByTagName("td")
                    If InStr(itTD.innerHTML, HEADING_OPEN) > 0 And Len(itTD.innerHTML) < 70 Then
                        ' Strip the wrapper tags to leave the heading text.
                        urlType2 = Trim(Replace(Replace(itTD.innerHTML, HEADING_OPEN, ""), HEADING_CLOSE, ""))
                        showLog urlTypeArr(i - 1) & " >> " & urlType2
                        getType = False
                        Exit For
                    End If
                Next
            Else
                ' Heading already seen: the next table with this layout is
                ' treated as the link list for that heading.
                If it.width = "100%" And it.align = "center" And it.bgColor = "#ffffff" And it.border = 0 Then
                    For Each itA In it.getElementsByTagName("a")
                        ' Skip very short hrefs and links back into 768.cc itself.
                        If Len(itA.href) > 4 And InStr(itA.href, "http://768.cc") = 0 Then
                            writeTMPXML urlTypeArr(i - 1) & " >> " & urlType2 & " : " & itA.innerHtml & ":" & itA.href & vbCrlf
                            saveDate urlTypeArr(i - 1), urlType2, itA.innerHtml, itA.href
                        End If
                    Next
                    ' Go back to searching for the next heading.
                    getType = True
                End If
            End If
        Next
    Next
End Sub
' Stores one crawled link in table czNetURL using the script-level ADO
' connection "conn".
'
' Parameters:
'   type1 - top-level category name
'   type2 - sub-category name
'   name  - link text
'   url   - link target; single quotes are removed, surrounding blanks are
'           trimmed, and "http://" is prepended when no scheme is present
'
' If a row with the same url and a different type1 exists it is overwritten,
' otherwise a new row is added. The function returns no value.
'
' NOTE(review): because the lookup filters on type1 <> the given type1, a row
' that already has the same url AND the same type1 is not matched and a second
' copy is inserted. Confirm whether "=" was intended.
Function saveDate(type1, type2, name, url)
    Dim rs, sql, cleanUrl, cleanType1

    ' Work on a local copy so the caller's variable is not modified
    ' (VBScript passes arguments ByRef by default).
    cleanUrl = Trim(Replace(url, "'", ""))
    If cleanUrl = "" Then Exit Function   ' nothing useful to store

    ' Prepend a scheme only when the URL has none; the original prefix test
    ' turned https:// and ftp:// links into "http://https://...".
    If InStr(cleanUrl, "://") = 0 Then cleanUrl = "http://" & cleanUrl

    cleanType1 = Trim(type1)

    Set rs = CreateObject("ADODB.RecordSet")
    ' cleanUrl has no quotes left; double any quote in type1 so the
    ' concatenated SQL literal stays well-formed.
    sql = "select * from czNetURL where url = '" & cleanUrl & "' and type1<>'" & Replace(cleanType1, "'", "''") & "'"
    rs.Open sql, conn, 1, 3   ' 1 = adOpenKeyset, 3 = adLockOptimistic

    If rs.EOF Then
        rs.AddNew
    End If
    rs("type1") = cleanType1
    rs("type2") = Trim(type2)
    rs("name") = Trim(name)
    rs("url") = cleanUrl
    rs.Update

    rs.Close
    Set rs = Nothing
End Function