อยากอ่านข่าวในเว็บข่าวหลายเว็บ
แต่เว็บข่าวในไทยหลายเว็บ support rss แต่ไม่ได้เป็นแบบ full feed ก็เราอยากดูดข่าวมาอ่านในมือถือด้วย plucker นี่หว่า
ทำไงดี เขียนไพทอนดูด feed มา แล้วทำการโหลดเว็บจากลิงค์ใน feed พร้อมทั้งตัดแต่เนื้อหาออกมา สร้างเป็นไฟล์ html ใหม่ดีกว่า
- ที่เขียนนี้ดูดข่าวจากเว็บ ผู้จัดการ ข่าวสด เดลินิวส์ ประชาไทย เฉพาะบางหัวข้อ ที่ผมสนใจ
- ที่เขียนนี้ feed ถ้า error จะ try ไปเรื่อย ๆ
- เข้ารหัสตัวอักษรใหม่ เป็น แบบ tis620
import feedparser
import urllib2
import socket
import re
# Compiled once: matches anything from "<" up to the next ">".
_TAG_RE = re.compile(r'<[^>]*?>')


def strip_html_tags(value):
    """Return *value* with every ``<...>`` tag removed.

    Only the tag markup itself is dropped; the text between tags is kept
    unchanged. A "<" that is never closed by ">" is left in place.
    """
    return _TAG_RE.sub('', value)
# Script body: download every configured RSS feed, fetch each linked article,
# cut the article text out of the page, and write it as <n>.html next to an
# index.html that links to all of them. Output is encoded as tis620.
# Written for Python 2 (urllib2); needs 2.6+ for "except ... as".

timeout = 6      # socket timeout in seconds, applied to every network call
MAX_TRIES = 100  # give up on a feed or an article after this many attempts

# One entry per news site:
#   allurl      - RSS feed URLs to read
#   webencoding - character encoding of the article pages
#   beginBody / endBody - markers surrounding the article text in the page
# "feedencoding", "beginHead" and "endHead" are not read by this script.
ListWebs = [
    {
        "allurl": ["http://www.prachatai.com/rss/"],
        "webencoding": "tis620",
        "feedencoding": "tis620",
        "beginBody": '<td align="left"><br>',
        "endBody": '<table width="100%" border="0" cellspacing="0" cellpadding="0">',
        "endHead": '</div>',
    },
    {
        "allurl": [
            "http://www.matichon.co.th/khaosod/rss/politics_news.xml",
            "http://www.matichon.co.th/khaosod/rss/foreign_news.xml",
            "http://www.matichon.co.th/khaosod/rss/economy_news.xml",
            "http://www.matichon.co.th/khaosod/rss/motor_news.xml",
            "http://www.matichon.co.th/khaosod/rss/technology_news.xml",
        ],
        "webencoding": "tis620",
        "feedencoding": "utf8",
        "beginBody": '<td align=left valign=top><font face=Tahoma, MS Sans Serif, CordiaUPC, DB ThaiText Extra, Thonburi size=3>',
        "endBody": '</td></tr></table>',
        "beginHead": '<div class="topic4">',
        "endHead": '</div>',
    },
    {
        "allurl": [
            "http://www.manager.co.th/RSS/Around/Around.xml",
            "http://www.manager.co.th/RSS/China/China.xml",
            # This literal carries a trailing space; URLs are stripped
            # before use in the main loop below.
            "http://www.manager.co.th/RSS/Business/Business.xml ",
            "http://www.manager.co.th/RSS/StockMarket/StockMarket.xml",
            "http://www.manager.co.th/RSS/Motoring/Motoring.xml",
            "http://www.manager.co.th/RSS/Cyberbiz/Cyberbiz.xml",
            "http://www.manager.co.th/RSS/Science/Science.xml",
        ],
        "webencoding": "tis620",
        "beginBody": '<td align="left" valign="baseline" class="body">\r\n',
        "endBody": '\r\n</td>\r\n</tr>\r\n</table>',
        "beginHead": '<td align="left" valign="baseline" class="headline">',
        "endHead": '</td>',
    },
    {
        "allurl": [
            "http://ads.dailynews.co.th/rss/1.xml",
            "http://ads.dailynews.co.th/rss/2.xml",
            "http://ads.dailynews.co.th/rss/5.xml",
            "http://ads.dailynews.co.th/rss/6.xml",
            "http://ads.dailynews.co.th/rss/8.xml",
        ],
        "webencoding": "utf8",
        "feedencoding": "utf8",
        "beginBody": '<div id="news_detail" >',
        "endBody": '</table>\r\n</div>',
        "beginHead": '<div class="topic4">',
        "endHead": '</div>',
    },
]


def fetch_feed(url, max_tries=MAX_TRIES):
    """Parse the feed at *url*, retrying until it yields a feed title.

    Returns a ``(parsed_feed, title)`` pair, or ``(None, None)`` when all
    *max_tries* attempts failed.
    """
    for _attempt in range(max_tries):
        try:
            parsed = feedparser.parse(url)
            # Reading the title is the success check: it raises when the
            # feed could not be read.
            title = parsed.feed.title
        except Exception:
            # "Exception" rather than a bare except, so Ctrl-C still stops
            # the script in the middle of a retry loop.
            print("feederror %s" % url)
        else:
            return parsed, title
    return None, None


def fetch_page(url, max_tries=MAX_TRIES):
    """Download *url* and return the raw page bytes, retrying on failure.

    Returns ``None`` when all *max_tries* attempts failed.
    """
    request = urllib2.Request(url)
    for attempt in range(1, max_tries + 1):
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read()
            finally:
                response.close()
        except IOError as e:
            print("%d url %s" % (attempt, url))
            if hasattr(e, 'reason'):
                print("Reason:  %s" % (e.reason,))
            elif hasattr(e, 'code'):
                print("Error code:  %s" % (e.code,))
        except Exception:
            print("%d url %s" % (attempt, url))
    return None


def extract_body(page, begin_marker, end_marker):
    """Return *page* from *begin_marker* up to (not including) *end_marker*.

    A missing marker falls back to the start / end of the page. Without the
    fallback, the -1 returned by ``find`` would be used as a slice index and
    yield an empty or truncated article.
    """
    start = page.find(begin_marker)
    if start == -1:
        start = 0
    stop = page.find(end_marker, start)
    if stop == -1:
        stop = len(page)
    return page[start:stop]


socket.setdefaulttimeout(timeout)

indexFile = open("index.html", "w")
itemIndex = 0  # running article number; also the article's file name
try:
    indexFile.write("<html><head><title>news_now</title></head>\n<body>\n")
    for web in ListWebs:
        webencoding = web["webencoding"]
        beginBody = web["beginBody"]
        endBody = web["endBody"]
        for url in web["allurl"]:
            # strip(): one configured URL carries a trailing space.
            d, title = fetch_feed(url.strip())
            if d is None:
                continue  # feed unreachable after MAX_TRIES; skip it
            feedTitle = title.encode("tis620", "ignore")
            indexFile.write("<h1>" + feedTitle + "</h1><br>\n")
            print(feedTitle)
            for item in d.entries:
                itemTextData = fetch_page(item.link)
                if itemTextData is None:
                    continue  # article unreachable; skip it
                # Number only articles that were actually downloaded, so the
                # index never links to a file that was not written.
                itemIndex = itemIndex + 1
                itemTextData = extract_body(itemTextData, beginBody, endBody)
                # NOTE(review): the pasted source replaced " " with " " (a
                # no-op); presumably the first argument was the "&nbsp;"
                # entity before the blog rendered it - confirm.
                itemTextData = itemTextData.replace("&nbsp;", " ")
                stag = strip_html_tags(item.title.encode("tis620", "ignore"))
                indexFile.write('<a href="' + str(itemIndex) + '.html">' + stag + '</a><br>\n')
                stag2 = strip_html_tags(itemTextData).decode(webencoding, "ignore")
                dataFile = open(str(itemIndex) + ".html", "w")
                try:
                    dataFile.write("<html><head><title>" + stag + "</title></head>\n<body>")
                    dataFile.write("<h1>" + stag + "</h1><br>\n")
                    dataFile.write(stag2.encode("tis620", "ignore"))
                    dataFile.write("</body></html>")
                finally:
                    dataFile.close()
                print(stag)
finally:
    # Always terminate and close the index, even after an error or Ctrl-C,
    # so whatever was collected so far is still a complete HTML page.
    indexFile.write("</body></html>")
    indexFile.close()
import urllib2
import socket
import re
# Compiled once: matches anything from "<" up to the next ">".
_TAG_RE = re.compile(r'<[^>]*?>')


def strip_html_tags(value):
    """Return *value* with every ``<...>`` tag removed.

    Only the tag markup itself is dropped; the text between tags is kept
    unchanged. A "<" that is never closed by ">" is left in place.
    """
    return _TAG_RE.sub('', value)
# Script body: download every configured RSS feed, fetch each linked article,
# cut the article text out of the page, and write it as <n>.html next to an
# index.html that links to all of them. Output is encoded as tis620.
# Written for Python 2 (urllib2); needs 2.6+ for "except ... as".

timeout = 6      # socket timeout in seconds, applied to every network call
MAX_TRIES = 100  # give up on a feed or an article after this many attempts

# One entry per news site:
#   allurl      - RSS feed URLs to read
#   webencoding - character encoding of the article pages
#   beginBody / endBody - markers surrounding the article text in the page
# "feedencoding", "beginHead" and "endHead" are not read by this script.
ListWebs = [
    {
        "allurl": ["http://www.prachatai.com/rss/"],
        "webencoding": "tis620",
        "feedencoding": "tis620",
        "beginBody": '<td align="left"><br>',
        "endBody": '<table width="100%" border="0" cellspacing="0" cellpadding="0">',
        "endHead": '</div>',
    },
    {
        "allurl": [
            "http://www.matichon.co.th/khaosod/rss/politics_news.xml",
            "http://www.matichon.co.th/khaosod/rss/foreign_news.xml",
            "http://www.matichon.co.th/khaosod/rss/economy_news.xml",
            "http://www.matichon.co.th/khaosod/rss/motor_news.xml",
            "http://www.matichon.co.th/khaosod/rss/technology_news.xml",
        ],
        "webencoding": "tis620",
        "feedencoding": "utf8",
        "beginBody": '<td align=left valign=top><font face=Tahoma, MS Sans Serif, CordiaUPC, DB ThaiText Extra, Thonburi size=3>',
        "endBody": '</td></tr></table>',
        "beginHead": '<div class="topic4">',
        "endHead": '</div>',
    },
    {
        "allurl": [
            "http://www.manager.co.th/RSS/Around/Around.xml",
            "http://www.manager.co.th/RSS/China/China.xml",
            # This literal carries a trailing space; URLs are stripped
            # before use in the main loop below.
            "http://www.manager.co.th/RSS/Business/Business.xml ",
            "http://www.manager.co.th/RSS/StockMarket/StockMarket.xml",
            "http://www.manager.co.th/RSS/Motoring/Motoring.xml",
            "http://www.manager.co.th/RSS/Cyberbiz/Cyberbiz.xml",
            "http://www.manager.co.th/RSS/Science/Science.xml",
        ],
        "webencoding": "tis620",
        "beginBody": '<td align="left" valign="baseline" class="body">\r\n',
        "endBody": '\r\n</td>\r\n</tr>\r\n</table>',
        "beginHead": '<td align="left" valign="baseline" class="headline">',
        "endHead": '</td>',
    },
    {
        "allurl": [
            "http://ads.dailynews.co.th/rss/1.xml",
            "http://ads.dailynews.co.th/rss/2.xml",
            "http://ads.dailynews.co.th/rss/5.xml",
            "http://ads.dailynews.co.th/rss/6.xml",
            "http://ads.dailynews.co.th/rss/8.xml",
        ],
        "webencoding": "utf8",
        "feedencoding": "utf8",
        "beginBody": '<div id="news_detail" >',
        "endBody": '</table>\r\n</div>',
        "beginHead": '<div class="topic4">',
        "endHead": '</div>',
    },
]


def fetch_feed(url, max_tries=MAX_TRIES):
    """Parse the feed at *url*, retrying until it yields a feed title.

    Returns a ``(parsed_feed, title)`` pair, or ``(None, None)`` when all
    *max_tries* attempts failed.
    """
    for _attempt in range(max_tries):
        try:
            parsed = feedparser.parse(url)
            # Reading the title is the success check: it raises when the
            # feed could not be read.
            title = parsed.feed.title
        except Exception:
            # "Exception" rather than a bare except, so Ctrl-C still stops
            # the script in the middle of a retry loop.
            print("feederror %s" % url)
        else:
            return parsed, title
    return None, None


def fetch_page(url, max_tries=MAX_TRIES):
    """Download *url* and return the raw page bytes, retrying on failure.

    Returns ``None`` when all *max_tries* attempts failed.
    """
    request = urllib2.Request(url)
    for attempt in range(1, max_tries + 1):
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read()
            finally:
                response.close()
        except IOError as e:
            print("%d url %s" % (attempt, url))
            if hasattr(e, 'reason'):
                print("Reason:  %s" % (e.reason,))
            elif hasattr(e, 'code'):
                print("Error code:  %s" % (e.code,))
        except Exception:
            print("%d url %s" % (attempt, url))
    return None


def extract_body(page, begin_marker, end_marker):
    """Return *page* from *begin_marker* up to (not including) *end_marker*.

    A missing marker falls back to the start / end of the page. Without the
    fallback, the -1 returned by ``find`` would be used as a slice index and
    yield an empty or truncated article.
    """
    start = page.find(begin_marker)
    if start == -1:
        start = 0
    stop = page.find(end_marker, start)
    if stop == -1:
        stop = len(page)
    return page[start:stop]


socket.setdefaulttimeout(timeout)

indexFile = open("index.html", "w")
itemIndex = 0  # running article number; also the article's file name
try:
    indexFile.write("<html><head><title>news_now</title></head>\n<body>\n")
    for web in ListWebs:
        webencoding = web["webencoding"]
        beginBody = web["beginBody"]
        endBody = web["endBody"]
        for url in web["allurl"]:
            # strip(): one configured URL carries a trailing space.
            d, title = fetch_feed(url.strip())
            if d is None:
                continue  # feed unreachable after MAX_TRIES; skip it
            feedTitle = title.encode("tis620", "ignore")
            indexFile.write("<h1>" + feedTitle + "</h1><br>\n")
            print(feedTitle)
            for item in d.entries:
                itemTextData = fetch_page(item.link)
                if itemTextData is None:
                    continue  # article unreachable; skip it
                # Number only articles that were actually downloaded, so the
                # index never links to a file that was not written.
                itemIndex = itemIndex + 1
                itemTextData = extract_body(itemTextData, beginBody, endBody)
                # NOTE(review): the pasted source replaced " " with " " (a
                # no-op); presumably the first argument was the "&nbsp;"
                # entity before the blog rendered it - confirm.
                itemTextData = itemTextData.replace("&nbsp;", " ")
                stag = strip_html_tags(item.title.encode("tis620", "ignore"))
                indexFile.write('<a href="' + str(itemIndex) + '.html">' + stag + '</a><br>\n')
                stag2 = strip_html_tags(itemTextData).decode(webencoding, "ignore")
                dataFile = open(str(itemIndex) + ".html", "w")
                try:
                    dataFile.write("<html><head><title>" + stag + "</title></head>\n<body>")
                    dataFile.write("<h1>" + stag + "</h1><br>\n")
                    dataFile.write(stag2.encode("tis620", "ignore"))
                    dataFile.write("</body></html>")
                finally:
                    dataFile.close()
                print(stag)
finally:
    # Always terminate and close the index, even after an error or Ctrl-C,
    # so whatever was collected so far is still a complete HTML page.
    indexFile.write("</body></html>")
    indexFile.close()
- ยังอยากปรับปรุงให้เป็น multi thread อยู่
- package feed parser ตั้งค่า timeout ตรงไหนฟะ
- ระวังเรื่อง string encoding เพราะ feedparser ได้แอบใช้ package ชื่อ chardet ในการระบุ encoding ถ้าไม่มี มันจะระบุเป็น utf8 อย่างมั่ว ๆ
(5 votes)

