当前位置: 首页 > 图文教程 > 脚本技术 > Python > 下载糗事百科的内容_python版

Python
Python 可爱的大小写
Python 条件判断的缩写方法
Python struct.unpack
Python splitlines使用技巧
比较详细Python正则表达式操作指南(re使用)
Python 过滤字符串的技巧,map与itertools.imap
Python open读写文件实现脚本
Python linecache.getline()读取文件中特定一行的脚本
Python 时间处理datetime实例
Python 命令行参数sys.argv
Python httplib,smtplib使用方法
Python 拷贝对象(深拷贝deepcopy与浅拷贝copy)
Python enumerate遍历数组示例应用
Python 初始化多维数组代码
Python 深入理解yield
Python __getattr__与__setattr__使用方法
Python 网络编程起步(Socket发送消息)
Python urlopen 使用小示例
Python 调用VC++的动态链接库(DLL)
新手该如何学python怎么学好python?

Python 中的 下载糗事百科的内容_python版


出处:互联网   整理: 软晨网(RuanChen.com)   发布: 2009-09-11   浏览: 61 ::
收藏到网摘: n/a

代码是没问题的,可以正常运行,但是希望做到以下2方面: 1、多线程下载 2、代码分离度更高,跟面向对象
复制代码 代码如下:

#coding:utf-8
import urllib.request
import xml.dom.minidom
import sqlite3
import threading
import time
class logger(object):
def log(self,*msg):
for i in msg:
print(i)
Log = logger()
Log.log('测试下')
class downloader(object):
def __init__(self,url):
self.url = url
def download(self):
Log.log('开始下载',self.url)
try:
content = urllib.request.urlopen(self.url).read()
#req = urllib.request.Request(url)
#response = urllib.request.urlopen(req)
#content = response.read()
Log.log('下载完毕')
return(content)
except:
Log.log('下载出错')
return(None)

class parser(object):
def __init__(self,content):
#获得根节点
self.html = xml.dom.minidom.parseString(content)
def parse(self):
Log.log('开始提取数据')
contents = {'content':'','url':[]}
#获得div节点
divs = self.html.getElementsByTagName('div')
#获得content节点
for div in divs:
if div.hasAttribute('class') and \
div.getAttribute('class') == 'content':
#获得糗事百科的内容
textNode = div.childNodes[0]
qContent = textNode.data
#数据填充
contents['content'] = qContent
#获得上一糗事、下一糗事节点
spans = self.html.getElementsByTagName('span')
for span in spans:
pspan = span.parentNode
if pspan.tagName == 'a':
#pspan为对应的链接,此时需要将对应的地址加入数据库
url = pspan.getAttribute('href')
qid = url[10:][:-4]
#数据填充
contents['url'].append(qid)
Log.log('提取数据完毕')
return(contents)
def downloadPage(qid,db):
url = 'http://www.qiushibaike.com/articles/'+str(qid)+'.htm'
content = downloader(url).download()
if content:
contents = parser(content).parse()
if contents['content']:
db.updateContent(qid,contents['content'])
for i in contents['url']:
db.addQID(i)
if len(contents['url']) == 2:
db.updateStatus(qid,2)
#下载池,表示同时允许下载的链接个数
class downloaderPool(object):
def __init__(self,maxLength=15):
self.downloaders = [None]*maxLength
self.downloadList = []
self.db = None
def setDownloadList(self,downloadList):
self.downloadList = list(set(self.downloadList+downloadList))
def setdb(self,db):
self.db = db
def daemon(self):
#每隔一秒查询线程的状态,为非活动线程则设置为None
Log.log('设置守护进程')
for index,downloader in enumerate(self.downloaders):
if downloader:
if not downloader.isAlive():
Log.log('将下载器置空',index)
self.downloaders[index] = None
#检查线程池状态
for index,downloader in enumerate(self.downloaders):
if not downloader:
qid = self.getQID()
if qid:
#创建线程
t = threading.Thread(target=downloadPage,args=(qid,self.db))
self.downloaders[index] = t
t.start()
t.join()
Log.log('设置下载器',index)
#间隔一秒执行一次
time.sleep(1)
def getQID(self):
try:
tmp = self.downloadList[0]
del self.downloadList[0]
return(tmp)
except:
return(None)
def beginDownload(self):
#创建守护线程
daemon = threading.Thread(target=self.daemon)
daemon.setDaemon(True)
daemon.start()
daemon.join()
def getDownloader(self):
for index,downloader in enumerate(self.downloaders):
if not downloader:
return(index)
return(None)

ADD_Q_ID = 'insert into qiushibaike(id,success) values(?,?)'
UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
Q_LIST = 'select id from qiushibaike where success=?'
Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'
class dbConnect(object):
"""
create table qiushibaike(
id,Integer
content,Varchar
success,Interger
)
#id表示糗事的ID
#content表示糗事的内容
#success表示是否下载成功,当该糗事内容下载完成,且获得上一页、下一页ID时表示下载完成
1表示未完成
2表示完成
"""
def __init__(self,dbpath='db.sqlite'):
self.dbpath = dbpath
def addQID(self,qid):
Log.log('插入糗事百科',qid)
#获得连接
cn = sqlite3.connect(self.dbpath)
c = cn.cursor()
try:
#添加内容并提交
c.execute(ADD_Q_ID,(qid,1))
cn.commit()
except:
Log.log('添加ID出错',qid)
#关闭连接
c.close()
cn.close()
Log.log('插入成功')
def updateContent(self,qid,content):
Log.log('更新糗事百科',qid,content)
#获得连接
cn = sqlite3.connect(self.dbpath)
c = cn.cursor()
#添加内容并提交
c.execute(UPDATE_Q_CONTENT,(content,qid))
cn.commit()
#关闭连接
c.close()
cn.close()
Log.log('更新成功')
def updateStatus(self,qid,flag):
Log.log('更新状态',qid,flag)
#获得连接
cn = sqlite3.connect(self.dbpath)
c = cn.cursor()
#添加内容并提交
c.execute(UPDATE_Q_STATUS,(flag,qid))
cn.commit()
#关闭连接
c.close()
cn.close()
Log.log('更新状态成功')
def getList(self,unDonloaded=1):
Log.log('获得列表')
l = []
#获得连接
cn = sqlite3.connect(self.dbpath)
c = cn.cursor()
#获得数据
c.execute(Q_LIST,(unDonloaded,))
rows = c.fetchall()
for i in rows:
l.append(i[0])
#关闭连接
c.close()
cn.close()
Log.log('获得列表成功')
return(l)
class singleDownloader(object):
def __init__(self):
self.downloadList = []
def setdb(self,db):
self.db = db
def setDownloadList(self,downloadList):
self.downloadList = list(set(self.downloadList+downloadList))
def beginDownload(self):
for i in self.downloadList:
downloadPage(i,self.db)
def main():
db = dbConnect('db.sqlite')
#dp = downloaderPool()
#dp.setdb(db)
sp = singleDownloader()
sp.setdb(db)
dp=sp
unDownloadedList = db.getList()
#当还有未下载的糗事时就要继续下载
while(len(unDownloadedList)):
#使用该列表填充下载池
dp.setDownloadList(unDownloadedList)
dp.beginDownload()
time.sleep(1)
#重置参数
unDownloadedList = db.getList()
if __name__ == '__main__':
main()

代码是没问题的,可以正常运行,但是希望做到以下2方面:
1、多线程下载
2、代码分离度更高,跟面向对象