#!/usr/bin/python
# encoding=utf-8
# Before using this script, find and change the username and password below.
import cookielib, urllib2, urllib, sys, time
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup  # For processing HTML
# Collapse text into non-empty lines separated by blank lines.
def formalize(text):
    result = ''
    lines = text.split(u'\n')
    for line in lines:
        line = line.strip()
        if len(line) == 0:
            continue
        result += line + u'\n\n'
    return result

# Log in to Xiaonei
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
exheaders = [("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.1; Windows NT 5.1; SV1)")]
opener.addheaders = exheaders
url_login = 'http://xiaonei.com/Login.do'
body = (('email', 'xxxxx@gmail.com'), ('password', '*****'))  # TODO: fill in your own login email and password
print "ERROR! you need to update the password to be successful!"
req1 = opener.open(url_login, urllib.urlencode(body))  # After this request the login cookie is stored in cj.

# Download Qiushibaike stories and post them to the blog one by one
body = {'relative_optype': 'publisher', 'blogControl': '1'}
url_post = 'http://blog.xiaonei.com/NewEntry.do'

# Posting loop
count = 0
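# range(11, 12) fetches only page 11 of the "best" list; widen the range to crawl more pages.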
for i in range(11, 12):
    url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i
    data = urllib2.urlopen(url).readlines()
    soup = BeautifulSoup("".join(data))
    contents = soup.findAll('div', "content")
    stories = [str(text) for text in contents]
    for story in stories:
        count += 1
        print "processing page %d, %d items added" % (i, count)
        minisoup = BeautifulSoup(story)
        #text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
        #text = urllib.unquote(unescape(text, {'&quot;': '"'}))
        text = str(minisoup)
        #text = text.encode("utf-8")
        title = '糗事-%d' % count
        text += '\n来自糗事百科\n'  # append a "from Qiushibaike" attribution line
        body['title'] = title
        body['body'] = text
        req2 = opener.open(url_post, urllib.urlencode(body))  # If nothing went wrong, the entry has been posted.