本文共 2704 字,大约阅读时间需要 9 分钟。
freebuf爬虫
#C:\Python27\python.exe
#coding:utf-8
"""Crawl the article listing pages of each FreeBuf subject into local files.

For every entry of ``subject_dict`` the listing pages are fetched one by one
and the regex matches found in them are written to ``<subject name>.html``
in the current working directory.

The script targets Python 2.7: it relies on ``reload(sys)``,
``sys.setdefaultencoding`` and ``urllib.unquote``.
"""
import os
import re
import sys
import urllib
# Pool is not used by the sequential crawl in main(); the import is retained
# from the original script.
from multiprocessing import Pool

import requests

# Python 2 idiom: re-expose sys.setdefaultencoding (hidden after interpreter
# start-up) so implicit str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

# Subject name (also used as the output file name) -> base URL of its listing.
subject_dict = {
    u'漏洞': 'http://www.freebuf.com/vuls',
    u'安全工具': 'http://www.freebuf.com/sectool',
    u'WEB安全': 'http://www.freebuf.com/articles/web',
    u'系统安全': 'http://www.freebuf.com/articles/system',
    u'网络安全': 'http://www.freebuf.com/articles/network',
    u'无线安全': 'http://www.freebuf.com/articles/wireless',
    u'终端安全': 'http://www.freebuf.com/articles/terminal',
    u'数据安全': 'http://www.freebuf.com/articles/database',
    u'安全管理': 'http://www.freebuf.com/articles/security-management',
    u'企业安全': 'http://www.freebuf.com/articles/es',
    u'极客': 'http://www.freebuf.com/geek',
}

# Number of 404 responses tolerated per subject; one more ends the subject.
MAX_NOT_FOUND = 3
# Failures in a row (exceptions while fetching/processing) before giving up
# on a subject, so a dead network cannot keep the loop running forever.
MAX_CONSECUTIVE_ERRORS = 5
# Seconds to wait for the server before requests raises a timeout error.
REQUEST_TIMEOUT = 30

# Compiled once instead of on every page. Raw strings keep the patterns
# identical to the original '...\n' literals while avoiding invalid-escape
# warnings for '\s' / '\S'.
# NOTE(review): both patterns look like they lost their HTML tag delimiters
# when the script was pasted (as written, the first captures everything up to
# the last " \n" and the second captures every line) — confirm against the
# original script before relying on the output.
_FIRST_PAGE_PATTERN = re.compile(r'([\s\S]*) \n', re.S)
_OTHER_PAGE_PATTERN = re.compile(r'([\s\S]*?)\n', re.S)


def spider(filename, url):
    """Download the listing pages of one subject into ``<filename>.html``.

    Pages are requested as ``url + '/page/' + N`` for N = 1, 2, 3, ...
    Each non-404 response body is scanned with a regex (one pattern for
    page 1, another for later pages); every match is UTF-8 encoded,
    URL-unquoted and written to the output file. The output file is
    truncated at the start of every call.

    A 404 response does not end the crawl immediately: the following page
    numbers are still probed, and the subject is declared finished on the
    ``MAX_NOT_FOUND + 1``-th 404 seen during the call (the count is
    cumulative; successful pages do not reset it).

    Any exception raised while fetching or processing a page is printed and
    that page is skipped. After ``MAX_CONSECUTIVE_ERRORS`` failures in a row
    the subject is abandoned.

    Args:
        filename: Subject name; ``".html"`` is appended to form the output
            path.
        url: Base URL of the subject's listing, without a trailing slash.

    Returns:
        None.
    """
    print("Crawling subject: %s" % filename)
    # Mode 'w' truncates any file left over from a previous run.
    with open(filename + ".html", 'w') as out:
        page = 0
        not_found_count = 0
        consecutive_errors = 0
        while True:
            page += 1
            try:
                response = requests.get(url + '/page/' + str(page),
                                        timeout=REQUEST_TIMEOUT)
                if response.status_code == 404:
                    not_found_count += 1
                    if not_found_count == 1:
                        print("Subject %s may only have %s pages."
                              % (filename, str(page - 1)))
                    if not_found_count > MAX_NOT_FOUND:
                        print("Subject %s finished!" % filename)
                        print("#################################")
                        break
                    # The message says "Retrying", but the loop moves on to
                    # the next page number rather than re-requesting this one.
                    print("Retrying %s: 404 not Found!" % str(not_found_count))
                else:
                    print(u"Parsing page: " + str(page))
                    if page == 1:
                        pattern = _FIRST_PAGE_PATTERN
                    else:
                        pattern = _OTHER_PAGE_PATTERN
                    for fragment in pattern.findall(response.text):
                        out.write(urllib.unquote(fragment.encode('utf-8')))
            except Exception as e:
                # Best effort: report the problem and skip this page, but do
                # not loop forever when every attempt fails.
                print(e)
                consecutive_errors += 1
                if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                    print("Giving up on subject %s after %s consecutive errors."
                          % (filename, str(consecutive_errors)))
                    break
            else:
                consecutive_errors = 0


def main():
    """Crawl every subject listed in ``subject_dict``, one after another."""
    for name, base_url in subject_dict.items():
        spider(name, base_url)


if __name__ == '__main__':
    main()