Wei's Blog | R&D Engineer

A practice Python web-scraping script (2009-06-11)

2011-03-09

I am currently learning Python, and as practice I wrote a small website content-scraping script. What it does:

1. Scrapes according to a configuration file.
2. Supports command-line arguments, e.g. python corn.py --config=urls.ini
3. Generates the list of page URLs from a rule (numeric page numbers only; it can also walk the pages in descending order; a sketch of that follows after the config file below).
4. Grabs a specific region of each list page according to a rule, then narrows that region down further to pull out the content-page URLs.
5. Writes the content-page URLs to a file, one URL per line, checking on each write whether the URL is already there (a cleaner variant of that check is also sketched after the config file below).
6. There are still plenty of bugs; I will keep polishing it.

Usage:

On Ubuntu, run in a terminal: python xxx.py --config=xxx.ini
On Windows, change #!/usr/bin/python to the path of your python.exe, then run on the command line: python xxx.py --config=xxx.ini

Here is the code; save it as a .py file:

Python code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# A simple spider / content-scraping script
# 小项
# 2008-09-18
import sys
import getopt
import re
import urllib
import ConfigParser
import time
import MySQLdb as mysql

if __name__ == "__main__":
    try:
        opts, argv = getopt.getopt(sys.argv[1:], 'c:', ['config='])
    except getopt.GetoptError:
        print "usage: python corn.py --config=<file.ini>"
        sys.exit(2)
    configfile = None
    for keys, cut in opts:
        #if keys in ('-w', '--write'):
        #    writefile()
        if keys in ('-c', '--config'):
            configfile = cut
    try:
        conf = ConfigParser.ConfigParser()
        conf.readfp(open(configfile))
        #== read the scraping configuration ==#
        #-- urllibs --#
        starturl = conf.get("urllibs", "starturl")
        startpage = int(conf.get("urllibs", "startpage"))
        endpage = int(conf.get("urllibs", "endpage"))
        urltemp = starturl + conf.get("urllibs", "urltemp")
        filelist = conf.get("urllibs", "urllist")
        dellist = conf.get("urllibs", "dellist")
        #-- countcfg --#
        Stops = int(conf.get("countcfg", "Stops"))
        Divurl = conf.get("countcfg", "Divurl")
        Urlls = conf.get("countcfg", "Urlls")
        Title = conf.get("countcfg", "Title")
        Keywords = conf.get("countcfg", "Keywords")
        Description = conf.get("countcfg", "Description")

        pageurls = [urltemp % page for page in range(startpage, endpage)]
        for url in pageurls:
            urllist = urllib.urlopen(url).read()        # fetch the list page
            urls = re.findall(Divurl, urllist)          # grab the configured region
            urls = "".join(urls)                        # join the list into one string so it can be searched again
            urlls = re.findall(Urlls, urls)             # pull the content-page urls out of that region
            urlfile = open(filelist, 'a+')              # open the url list file for reading and appending
            urlfile.seek(0)                             # read the urls already stored, starting from the top
            outurl = urlfile.readlines()
            for link in set(urlls):
                #== tidy the url and check for duplicates ==#
                curls = link + "\n"
                if curls in outurl:
                    print link + " already listed, skipping"
                    continue                            # continue skips this url only; break would stop the whole loop
                urlfile.write(link + '\n')              # append the new url
            urlfile.close()
            print "url list finished, saved to", filelist
            time.sleep(Stops)                           # pause here as well

        #== read the list file one line at a time so every stored url gets processed ==#
        listurl = open(filelist, 'r')
        mun = len(listurl.readlines()) + 1              # count the lines so we know how many urls to fetch
        listurl.seek(0)                                 # jump back to the start of the file
        #contents = open('contens.txt', 'a')
        User = 'root'
        Passwd = '970207'
        Host = 'localhost'
        Db = 'testcorn'
        contents = mysql.connect(user=User, passwd=Passwd, host=Host, db=Db).cursor()
        #== loop over the content pages ==#
        for conurl in range(1, mun):
            curl = listurl.readline().strip()
            #okurl = r'http://www.510buy.com/yewu/5662.html'
            #print okurl
            time.sleep(Stops)                           # pause so we do not get banned or time out before the next fetch
            content = urllib.urlopen(starturl + curl).read()    # fetch the content page
            title = re.findall(Title, content)          # extract the title
            keywords = re.findall(Keywords, content)    # extract the keywords
            description = re.findall(Description, content)      # extract the description
            for title, keywords, description in zip(set(title), set(keywords), set(description)):
                #print title                            # uncomment to inspect the output
                #contents.write('[Title:]' + title + '\n')
                #contents.write('[Keyword:]' + keywords + '\n')
                #contents.write('[Description:]' + description + '\n\n')
                #values = (title, keywords, description)
                #contents.execute("INSERT INTO counts (title, keywords, description) VALUES (%s, %s, %s)", values)
                #title = title.encode("utf-8")
                print "saved", title, "- pausing", Stops, "seconds before the next fetch"
        contents.close()
    except KeyboardInterrupt:
        print "stopped by user"

Below is the ini configuration file; save it as a .ini file:

Ini configuration file code:

[urllibs]
# target site
starturl = http://www.510buy.com
# first list page number
startpage = 2
# last list page number
endpage = 3
# list url template
urltemp = /yewu/list_%d.html
# path of the file that stores the url list
urllist = /home/buysz/桌面/urllist.ini
# urls to exclude from the list, separated by commas
dellist = http://www.510buy.com,http://www.510buy.com" target="_blank,/yewu/index.html,http://www.510buy.com/

[countcfg]
# seconds to pause between fetches
Stops = 1
# regex for the specific region of the list page that holds the urls
Divurl = <div.*?>(.*?)<\/div>
# regex for the urls inside that region
Urlls = <a href=["|'](.*?)["']>
# regex for the title
Title = (.*?) - .*?
# regex for the keywords
Keywords = name="keywords" content="(.*?)">
# regex for the page description
Description = name="description" content="(.*?)">
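Point 3 of the feature list mentions walking the pages in descending order, which the code above does not actually do; here is a minimal sketch of how the page expansion could support both directions. The function name build_page_urls and the descending flag are my own, for illustration only, not part of the script.

# a minimal sketch, not part of the original script: expand the url template
# into a list of page urls, optionally from the last page down to the first
def build_page_urls(urltemp, startpage, endpage, descending=False):
    pages = range(startpage, endpage)          # e.g. [2] for startpage=2, endpage=3
    if descending:
        pages = list(pages)[::-1]              # walk the pages in reverse order
    return [urltemp % page for page in pages]

# usage, with values like the sample ini above:
# build_page_urls("http://www.510buy.com/yewu/list_%d.html", 2, 5, descending=True)
# -> [".../list_4.html", ".../list_3.html", ".../list_2.html"]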
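Point 5 checks every new URL against the lines already stored in the list file. A slightly cleaner variant of that check, keeping the same one-URL-per-line file format, could look like the sketch below; the helper name append_new_urls is my own, not from the script.

# a minimal sketch, not part of the original script: append only urls that are
# not yet in the list file, using a set for the membership test
def append_new_urls(filelist, found_urls):
    listfile = open(filelist, 'a+')                  # read existing urls, append new ones
    listfile.seek(0)
    known = set(line.strip() for line in listfile)   # urls already stored
    for link in found_urls:
        if link in known:
            continue                                 # skip duplicates
        listfile.write(link + '\n')
        known.add(link)                              # repeats inside found_urls are skipped too
    listfile.close()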
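Before letting the spider loose it can help to check that the [countcfg] patterns match what you expect. A small sketch, assuming the config above has been saved as urls.ini; the sample HTML fragment is made up purely for illustration.

# a minimal sketch, not part of the original script: sanity-check the regexes
import re
import ConfigParser

conf = ConfigParser.ConfigParser()
conf.readfp(open('urls.ini'))

# made-up fragment that imitates a list page and its meta tags
sample = ('name="keywords" content="python,spider">'
          '<div class="list"><a href="/yewu/5662.html">some item</a></div>')

print re.findall(conf.get('countcfg', 'Divurl'), sample)     # the region between the div tags
print re.findall(conf.get('countcfg', 'Urlls'), sample)      # the hrefs found inside it
print re.findall(conf.get('countcfg', 'Keywords'), sample)   # the meta keywords value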

