伟的博客 研发工程师

校内网发帖机 in Python,请勿滥用 2009-07-01

2011-03-09

#!/usr/bin/python
#encoding=utf-8
# Before use, find and change the username and password below.

import cookielib
import sys
import time
import urllib
import urllib2
from xml.sax.saxutils import unescape

from BeautifulSoup import BeautifulSoup  # For processing HTML

def formalize(text):
    """Return *text* with every non-blank line followed by a blank line.

    The input is split on newline characters, each piece is stripped of
    surrounding whitespace, and pieces that end up empty are dropped.
    Every remaining line is emitted followed by two newline characters,
    so an input with no non-blank lines yields an empty string.
    """
    stripped = (line.strip() for line in text.split(u'\n'))
    # One join instead of repeated ``+=`` keeps the build linear.
    return u''.join(line + u'\n\n' for line in stripped if line)

# Log in to Xiaonei. The opener is built around a cookie jar so that
# cookies set by the login response are reused by later opener.open() calls.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
exheaders = [
    ("User-Agent",
     "Mozilla/4.0 (compatible; MSIE 7.1; Windows NT 5.1; SV1)"),
]
opener.addheaders = exheaders
url_login = 'http://xiaonei.com/Login.do'
# TODO: change the login name and password before running.
body = (('email', 'xxxxx@gmail.com'), ('password', '*****'))
# Printed unconditionally, as a reminder that the placeholder credentials
# above must be replaced. Written so it parses as a statement or a call.
print("ERROR! you need to update the password to be successful!")
req1 = opener.open(url_login, urllib.urlencode(body))
# At this point the login cookie has been received.

# Download stories from Qiushibaike and publish them one by one as blog
# entries. ``body`` holds the form fields sent with every post; the title
# and body fields are overwritten for each story.
body = {'relative_optype': 'publisher', 'blogControl': '1'}
url_post = 'http://blog.xiaonei.com/NewEntry.do'

# Posting loop. ``count`` numbers the stories across all fetched pages.
count = 0
for i in range(11, 12):
    url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    soup = BeautifulSoup(data)
    contents = soup.findAll('div', "content")
    stories = [str(text) for text in contents]
    for story in stories:
        count += 1
        print("processing page %d, %d items added" % (i, count))
        minisoup = BeautifulSoup(story)
        # NOTE: the original also carried a disabled variant that extracted
        # plain text and unescaped entities; the HTML markup is posted as is.
        text = str(minisoup)
        title = '糗事-%d' % count
        # NOTE(review): in the published source this literal spanned three
        # lines; it presumably contained HTML line-break markup that was lost
        # when the page was rendered -- confirm against the original script.
        text += '\n来自糗事百科\n'
        body['title'] = title
        body['body'] = text
        req2 = opener.open(url_post, urllib.urlencode(body))
        # Barring surprises, the entry has been posted by now.


Comments