无聊之下爬行了小飞的网址大全

2006-02-24

WebSpider

共采集了1501条记录，先放着，以后可能有用

源码如下

'Crawl script for the 768.cc web directory
'For WebSpider2
'Author: 梦游的猫
'Created:       2006-02-23 23:27:37
'Last modified: 2006-02-24 01:23:39

'Option Explicit is left disabled: this script assigns conn and MyConn
'without declaring them with Dim.
'Option Explicit

'Script-level ADO connection; saveDate opens its recordsets on it.
Set conn = CreateObject("ADODB.Connection")
'The keyword is "Persist Security Info" (it was misspelled "InFso").
MyConn="Provider=SQLOLEDB.1;Persist Security Info=true;Data Source=127.0.0.1;Initial Catalog='WebSpider';User ID='spider';Password='spider';CONNect Timeout=30"
Conn.Open MyConn

'Main: walks the numbered category pages under urlMain (1.htm, 2.htm, ...),
'one page per entry of urlType. On each page it alternates between two
'states, tracked by getType:
'  getType = True  -> look for a <td> holding a sub-category heading
'  getType = False -> look for the link table that belongs to that heading
'Every outbound link found is logged through writeTMPXML and stored through
'saveDate.
'Spider, Document, showLog and writeTMPXML are not defined in this script;
'presumably they are supplied by the WebSpider2 host - TODO confirm.
Sub Main
 Dim urlMain, urlType, urlTypeArr, urlType2, getType
 Dim i, it, itTD, itA

 'Markup that wraps a sub-category heading. The test and the two strip
 'operations must use the same markers; the original tested for "**<FONT ..."
 'but stripped "<STRONG><FONT ...", so the test could never match.
 Const HEAD_OPEN = "<STRONG><FONT color=#996600>"
 Const HEAD_CLOSE = "</FONT></STRONG>"

 urlMain = "http://768.cc/DH/page/"
 urlType = "党政|新闻|文教|贸易|广告|科技|交通|建筑|工业|日用品|旅游|副食|娱乐|医疗|论坛|其它"

 urlTypeArr = split(urlType,"|")
 getType = true
 'Page numbers are 1-based while urlTypeArr is 0-based; the bound is derived
 'from the array so the page count always matches the category names.
 for i = 1 to UBound(urlTypeArr) + 1
  Call Spider.OpenURL(Document,urlMain & i & ".htm")
  For Each it in Document.getElementsByTagName("table")

   if getType then
    'Looking for a heading cell; the length limit skips larger cells that
    'merely contain the heading markup somewhere inside them.
    For Each itTD in it.getElementsByTagName("td")
     if instr(itTD.innerHTML,HEAD_OPEN) > 0 and len(itTD.innerHTML)<70 then
      urlType2 = trim(replace(replace(itTD.innerHTML , HEAD_OPEN,""),HEAD_CLOSE,""))
      showLog urlTypeArr(i-1) & " >> " & urlType2
      getType = false
      exit for
     end if
    next
   else
    'A heading was found; the next table with these attributes is treated
    'as its link list.
    if it.width="100%" and it.align="center" and it.bgColor="#ffffff" and it.border=0 then
     For Each itA in it.getElementsByTagName("a")
      'Skip very short hrefs and links that point back into 768.cc itself.
      if len(itA.href)>4 and instr(itA.href,"http://768.cc")=0 then
       writeTMPXML urlTypeArr(i-1) & " >> " & urlType2 & " : " & itA.innerHtml & ":" & itA.href & vbCrlf
       saveDate urlTypeArr(i-1),urlType2 ,itA.innerHtml,itA.href
      end if
     next
     'Link list consumed: go back to looking for the next heading.
     getType = true
    end if
   end if
  next
 next
End Sub

'saveDate: stores one crawled link in table czNetURL through the script-level
'ADO connection "conn".
'  type1 - top-level category name
'  type2 - sub-category name
'  name  - link text
'  url   - link target; apostrophes are removed and "http://" is prepended
'          when the value has no http/https scheme
'If a row with the same url but a different type1 exists, the first such row
'is overwritten; otherwise a new row is added.
'NOTE(review): because the lookup requires type1 to differ, a url that is
'already stored under the same type1 gets a second row - confirm this is
'intended.
'Returns nothing.
function saveDate(type1,type2,name,url)
 Dim rs, sql, cleanUrl, cleanType1

 'Work on local copies so the caller's by-reference arguments are not
 'modified, and trim up front so the lookup uses the value that is stored.
 cleanUrl = trim(replace(url,"'",""))
 cleanType1 = trim(type1)

 'Case-insensitive scheme check that also accepts https, so such links are
 'not turned into "http://https://...".
 if LCase(left(cleanUrl,7)) <> "http://" and LCase(left(cleanUrl,8)) <> "https://" then
  cleanUrl = "http://" & cleanUrl
 end if

 set rs = CreateObject("ADODB.RecordSet")
 'cleanUrl has no apostrophes left; double any in type1 so the SQL string
 'literal stays well-formed.
 sql = "select * from czNetURL where url = '" & cleanUrl & "' and type1<>'" & replace(cleanType1,"'","''") & "'"
 '1 = adOpenKeyset, 3 = adLockOptimistic (an updatable recordset)
 rs.open sql,conn,1,3
 if rs.eof then
  rs.addNew
 end if
 rs("type1") = cleanType1
 rs("type2") = trim(type2)
 rs("name") = trim(name)
 rs("url") = cleanUrl
 rs.update
 rs.close
 set rs = Nothing
end Function

jsonContent: meta: false pages: false posts: title: true date: true path: true text: false raw: false content: false slug: false updated: false comments: false link: false permalink: false excerpt: false categories: false tags: true