#!/usr/bin/env python
#coding: utf-8
import urllib2
import re
import xlwt
link_list = []
xls = xlwt.Workbook()
table = xls.add_sheet("main")
for i in xrange(288, 318):
link_list.append("http://www.lanqiao.org/Article/ShowArticle.asp?ArticleID=%d"%i)
chinese = []
content = ''
while len(link_list) > 0:
for link in link_list:
print link
try:
content = urllib2.urlopen(url = link, timeout = 3).read()
link_list = link_list[1:]
except:
print "timeout!"
if link not in link_list[1:]:
link_list.append(link)
some = re.findall(u'<TR.*?<\/TR>', content.decode('gbk', 'ignore'), re.S)
for s in some:
chinese_value = re.findall(u'(?<=\u5b8b\u4f53>).*(?=<\/FONT>)', s)
chinese.append(','.join(chinese_value))
for line in xrange(0, len(chinese)):
tmp = chinese[line].split(',')
for i in xrange(0, len(tmp)):
table.write(line, i, tmp[i])
print "%d, %d, %s" % (line, i, tmp[i])
xls.save('x.xls')
print len(link_list)