标签归档:rss

Python 之 XML与文本操作

一直想做一个自动读取RSS,自动更新的站。今晚正好没有事,先写个Demo。
临时先保存到文本文件中。顺便复习一下Python。
用到了MD5,XML.DOM,URllib,OS库,都是默认的。
直接插代码吧。

import sys
#coding=gb2312
reload(sys)
sys.setdefaultencoding('utf-8')
import md5
import os
import urllib
import xml.dom.minidom as xxs

#Config
tmpPath='E:/Else/rss-engins/'

#FileName2Md5
def md5str(t):
	h=md5.new()
	h.update(t)
	return str(h.hexdigest())
	
#FileSystem
def saveFile(t,c):
	f='data/'+md5str(t)+'.txt'
	if os.path.isfile(f):
		return
	fi=open(tmpPath+'index.txt','a')
	fi.writelines(t+'\n')
	fg=open(tmpPath+f,'w')
	fg.writelines(c)
	
#RSS
def saveRss(f,c,encoding):
	fp=open(f,'w')
	if encoding=='gb2312':
		c=c.decode('gb2312').encode('utf-8')
		c=c.replace('encoding="gb2312"','encoding="utf-8"')
	fp.writelines(c)
	fp.close()
	return f
def getRss(url):
	ul=urllib.urlopen(url)
	t=ul.read()
	ul.close()
	return t
def rssMain(url,encoding):
	f=tmpPath+'xml/'+md5str(url)+'.xml'
	if os.path.isfile(f):
		return "isok"
	return saveRss(f,getRss(url),encoding)

#getInfo
def getRssInfo(url,encoding):
	t=rssMain(url,encoding)
	if t=='isok':
		print '该路径已经采集过了。'
		return
	t=xxs.parse(t)
	ri=t.getElementsByTagName("channel")[0].getElementsByTagName("item")
	for item in ri:
		title=str(item.getElementsByTagName("title")[0].childNodes[0].data)
		content=str(item.getElementsByTagName("description")[0].childNodes[0].data)
		print '保存文章('+title+')中...'
		saveFile(title,content)

#处理
def BaiduHi(RS):
	for rs in RS:
		print '加载用户:'+rs+'(http://hi.baidu.com/'+rs+'/rss)...'
		getRssInfo(r'http://hi.baidu.com/'+rs+'/rss','gb2312')
		print '用户:'+rs+'处理完毕。\n'
		
def SohuBlog(RS):
	for rs in RS:
		print '加载用户:'+rs+'(http://'+rs+'.blog.sohu.com/rss)...'
		getRssInfo(r'http://'+rs+'.blog.sohu.com/rss','utf-8')
		print '用户:'+rs+'处理完毕。\n'
		
#RSS服务器列表
RssServer=['mqycn','yilin','loveinmyhome','guojing021','900ip','037123']
BaiduHi(RssServer)

RssServer=['appler969','saber-bing','02040229','seahai','satanqueen','douzwang']
SohuBlog(RssServer)

现在就可以采集以上一个百度hi用户的最新博文了。呵呵,核心还是RSS。

===============================================================
使用说明:读取RSS,并生成记录到本地。
索引文件为index.txt。每条新闻记录为一行。对应的内容文件为data/{md5(文章名)}.txt

===============================================================
更新日期:2009-12-13
添加了是否读取的判断,如果读取,则不再采集

===============================================================
更新日期:2009-12-24
修改了部分编码。解决了UTF-8出粗的问题。
添加了搜狐博客的测试代码。

Asp生成RSS的类[转]

改程序的作者为飞飞,不是本人原创,使用的时候如果方便别忘了价格飞飞的链接哦,支持原创。。。

<%
Dim Rs,Newrss
Class Rss
‘========参数调用说明==================
‘作者 : 飞飞
‘网站 :  [url=http://www.ffasp.com]www.ffasp.com[/url]
‘QQ  :  276230416
‘邮箱 :  [url=mailto:huanmie913@163.com]huanmie913@163.com[/url]
‘注:您可以更改和使用本程序,但请保留作者和出处
‘*******************输入参数********************
‘***********************************************
‘SetConn          必填      网站使用的Connection对象
‘SetSql          必填      Sql查询语句。强烈建议使用在Sql语句中使用Top关键字
‘        Sql语句中包含的字段[新闻Id,标题,内容,时间,静态页名称]
‘                            注:不要颠倒顺序
‘                            如果新闻不是生成的静态页,则无最后一项,SetPageType的值则为1
‘SetWebName      必填      网站名称
‘SetWebUrl        必填      网站的地址
‘SetWebDes        非必填    网站的描述信息
‘SetPageType      必填      信息显示页的链接类型 1 为动态页面Id  0为静态页面
‘SetMaxInfo      非必填    强制显示信息的数目,若取数据>SetMaxInfo
则显示SetMaxInfo条数据。强烈建议使用在Sql语句中使用Top关键字
‘setContentShow  非必填    信息简介设置。注意:该参数为数组(ShowContentType,ShowContentLen)
‘                                    ShowContentType  [数字类型]
为内容显示方式[参数(0,1)0为按百分比就算显示信息,1为按字数]
‘                                    ShowContentLen  内容显示的长度
由ShowContentType 决定实际长度
‘*****************输出参数********************
‘ShowRss  显示Rss
‘======================================================
‘例如
‘Set NewRss=New Rss
‘ Set NewRss.SetConn=article_conn
‘ NewRss.SetSql=”select top 30 newsid,title,content,dateandtime,N_fname
from article where typeid=1 order by newsid Desc”
‘ NewRss.SetWebName=”测试中”

NewRss.SetWebUrl=”[url=http://www.ffasp.com/rss/rss.asp]http://www.ffasp.com/rss/rss.asp[/url]”

‘ NewRss.SetMaxInfo=10
‘ NewRss.SetInfourl=”[url=http://www.ffasp.com]http://www.ffasp.com[/url]”

‘ NewRss.SetPageType=”0″
‘ NewRss.setContentShow=”1,200″
‘ NewRss.ShowRss()
‘Set NewRss=Nothing
‘======================================================

Private Conn,Sql,WebName,WebUrl,WebDes,Er,MaxInfo,i,Infourl,PageType
Private ShowContentType,ShowContentLen
Private AllContent,AllContentLen

Private Sub Class_initialize()
MaxInfo=20
‘PageType=1
ShowContentType=0
ShowContentLen=20
Er=false
End Sub

Private Sub Class_terminate()
If isObject(Rs) then Set Rs=Nothing
End Sub

Public Property Let Errmsg(msg)
If Er then
Response.Clear()
Response.Write(msg)
Response.End()
End If
End Property

Public Property Let SetWebName(WebName_)
WebName=WebName_
End Property

Public Property Let SetWebUrl(WebUrl_)
WebUrl=WebUrl_
End Property

Public Property Let SetWebDes(webDes_)
WebDes=WebDes_
End Property

Public Property Let SetInfoUrl(Infourl_)
Infourl=Infourl_
End Property

Public Property Let SetPageType(PageType_)
PageType=PageType_
End Property

Public Property Let SetMaxInfo(MaxInfo_)
MaxInfo=MaxInfo_
End Property

Public Property Let setContentShow(ContentShow_)
Dim ArrContentShow
ArrContentShow=Split(ContentShow_,”,”)
If Ubound(ArrContentShow)<>1 Then Er=True:Errmsg=”信息显示参数设置有误!!”
ShowContentType=ArrContentShow(0)
ShowContentLen=ArrContentShow(1)
If Not isnumeric(ShowContentType) or ShowContentType=”” Then
ShowContentType=0
If Not isnumeric(ShowContentLen) or ShowContentLen=”” Then
If ShowContentType=0  Then ShowContentLen=20 Else ShowContentLen=200
Else
If ShowContentType=0 and (ShowContentLen>100 or ShowContentLen<10) Then
ShowContentLen=20
End If
End Property

Public Property Set SetConn(Conn_)
If TypeName(Conn_)=”Connection”  Then
Set Conn=Conn_
Else
Er=true
Errmsg=”数据库连接错误”
Exit property
End If
End Property

Public Property Let SetSql(sql_)
Sql=Sql_
End Property

Public Property Get RssHead()
RssHead=”<?xml version=””1.0″” encoding=””gb2312″” ?> ”
RssHead=RssHead&”<rss>”
RssHead=RssHead&”<channel>”
RssHead=RssHead&”<title>”&WebName&”</title>”
RssHead=RssHead&”<link>”&WebUrl&”</link>”
RssHead=RssHead&”<description>”&WebDes&”</description>”
End Property

Private Property Get RssBottom()
RssBottom=”</channel>”
RssBottom=RssBottom&”</rss>”
End Property

Public Sub ShowRss()
On Error resume Next
Dim Rs
Dim ShowInfoUrl,ShowContent,Content
If TypeName(Conn)<>”Connection” Then Er=True:Errmsg=”Connection对象有误”
If Sql=”” or isnull(Sql)=”” or isempty(Sql)=”” Then
Er=True:Errmsg=”没有可执行的Sql语句”
If WebName=”” or isnull(WebName)=”” or isempty(WebName)=”” Then
Er=True:Errmsg=”请设置RSS标题”
If WebUrl=”” or isnull(WebUrl)=”” or isempty(WebUrl)=”” Then
Er=True:Errmsg=”请设置网站的链接”
If InfoUrl=”” or isnull(InfoUrl)=”” or isempty(InfoUrl)=”” Then
Er=True:Errmsg=”请设置链接信息”
If PageType=”” or isnull(PageType)=”” or isempty(PageType)=”” Then
Er=True:Errmsg=”请设置链接类型”
Set Rs=Server.CreateObject(“ADODB.RecordSet”)
Rs.Open Sql,Conn,1,1
If Err Then
Er=true
Errmsg=”数据库未能打开
请检查您的Sql语句是否正确”
Exit Sub
End If

Response.Charset = “gb2312″
Response.ContentType=”text/xml”
Response.Write(RssHead)
For i =1 to MaxInfo
‘*****************************
ShowInfoUrl=InfoUrl
If ShowInfoUrl=”” or isnull(ShowInfoUrl) or isempty(ShowInfoUrl) Then
ShowInfoUrl=”#”
Else
If PageType Then ShowInfoUrl=ShowInfoUrl&Rs(0) Else
ShowInfoUrl=ShowInfoUrl&Rs(4)
End If
‘*****************************
AllContent=LoseHtml(Rs(2))
AllContentLen=byteLen(AllContent)
ShowContent=int(ShowContentLen)
If ShowContentType=0 Then ShowContent=AllContentLen*ShowContent/100
Content=Server.HTMLEncode(titleb(AllContent,ShowContent))
Response.Write(“<item>”)
Response.Write(“<title>”)
Response.Write(Rs(1))
Response.Write(“</title>”)
Response.Write(“<link>”)
Response.Write(ShowInfoUrl)
Response.Write(“</link>”)
Response.Write(“<description>”)
Response.Write(Content)
Response.Write(“</description>”)
Response.Write(“<pubDate>”)
Response.Write(return_RFC822_Date(Rs(3),”GMT”))
Response.Write(“</pubDate>”)
Response.Write(“</item>”)
If Rs.Eof or i>cint(MaxInfo) Then Exit For
Rs.MoveNext
Next
Response.Write(RssBottom)
End Sub

Function LoseHtml(ContentStr)
Dim ClsTempLoseStr,regEx
ClsTempLoseStr = Cstr(ContentStr)
Set regEx = New RegExp
regEx.Pattern = “<\/*[^<>]*>”
regEx.IgnoreCase = True
regEx.Global = True
ClsTempLoseStr = regEx.Replace(ClsTempLoseStr,””)
LoseHtml = ClsTempLoseStr
End function

Function return_RFC822_Date(byVal myDate, byVal TimeZone)
Dim myDay, myDays, myMonth, myYear
Dim myHours, myMinutes, mySeconds

myDate = CDate(myDate)
myDay = EnWeekDayName(myDate)
myDays = Right(“00” & Day(myDate),2)
myMonth = EnMonthName(myDate)
myYear = Year(myDate)
myHours = Right(“00” & Hour(myDate),2)
myMinutes = Right(“00” & Minute(myDate),2)
mySeconds = Right(“00″ & Second(myDate),2)

return_RFC822_Date = myDay&”, “& _
myDays&” “& _
myMonth&” “& _
myYear&” “& _
myHours&”:”& _
myMinutes&”:”& _
mySeconds&” “& _
” ” & TimeZone
End Function
Function EnWeekDayName(InputDate)
Dim Result
Select Case WeekDay(InputDate,1)
Case 1:Result=”Sun”
Case 2:Result=”Mon”
Case 3:Result=”Tue”
Case 4:Result=”Wed”
Case 5:Result=”Thu”
Case 6:Result=”Fri”
Case 7:Result=”Sat”
End Select
EnWeekDayName = Result
End Function

Function EnMonthName(InputDate)
Dim Result
Select Case Month(InputDate)
Case 1:Result=”Jan”
Case 2:Result=”Feb”
Case 3:Result=”Mar”
Case 4:Result=”Apr”
Case 5:Result=”May”
Case 6:Result=”Jun”
Case 7:Result=”Jul”
Case 8:Result=”Aug”
Case 9:Result=”Sep”
Case 10:Result=”Oct”
Case 11:Result=”Nov”
Case 12:Result=”Dec”
End Select
EnMonthName = Result
End Function

function titleb(str,strlen)
Dim Bstrlen
bstrlen=strlen
If isempty(str) or isnull(str) or str=”” Then
titleb=str
exit function
Else
dim l,t,c,i
l=len(str)
t=0

for i=1 to l
c=Abs(Asc(Mid(str,i,1)))
if c>255 then
t=t+2
else
t=t+1
end if

if t>=bstrlen then
titleb=left(str,i)
exit for
else
titleb=str&””
end if
next
End If
end function

function byteLen(str)
dim lenStr,lenTemp,i
lenStr=0
lenTemp=len(str)
dim strTemp
for i=1 to lenTemp
strTemp=asc(mid(str,i,1))
if strTemp>255 or strTemp<=0 then
lenStr=lenStr+2
else
lenStr=lenStr+1
end if
next
byteLen=lenStr
end function
End Class
%>