Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

rai.py@ 15927

Last change on this file since 15927 was 7451, checked in by BPanther, 15 years ago
[ipk] - copy source->source.sh4
File size: 15.1 KB

Line
1	#!/usr/bin/python
2	# rai.py by Ambrosa http://www.ambrosa.net
3	# this module is used for download EPG data from Rai website
4	# derived from E2_LOADEPG
5
6	__author__ = "ambrosa http://www.ambrosa.net"
7	__copyright__ = "Copyright (C) 2008-2011 Alessandro Ambrosini"
8	__license__ = "CreativeCommons by-nc-sa http://creativecommons.org/licenses/by-nc-sa/3.0/"
9
10	import os
11	import sys
12	import time
13	import codecs
14	import socket
15	import string
16	import random
17	import urllib2
18	import ConfigParser
19
20	# import CrossEPG functions
21	import crossepg
22
# The helper modules used below (sgmllib, scriptlib) are shipped under
# "<install root>/scripts/lib": resolve that directory and append it to
# sys.path so that the imports which follow can find them.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    # the install root lookup reported failure: nothing else can work
    sys.exit(1)
libdir = os.path.join(crossepg_instroot, 'scripts/lib')
sys.path.append(libdir)
30
31	# import local modules
32	import sgmllib
33	import scriptlib
34
35	# =================================================================
36	# HTML PARSER
37
class Titolo_parser(sgmllib.SGMLParser):
    """Extract (start time, title) pairs from one day of schedule HTML.

    The parser reacts to this markup pattern:

    * ``<div class="intG">`` opens a schedule entry;
    * ``<span class="ora">`` announces that the next text is the start
      time, formatted ``HH:MM``;
    * ``<span class="info">`` followed by ``<a>`` announces that the next
      text is the programme title.

    Times earlier than 06:00 are assigned to the day after ``day_get``.
    Once such a time has been seen, the first time >= 06:00 ends the useful
    part of the page and every later entry is ignored.

    Results are returned by get_guida() as two lists of
    ``("YYYY-MM-DD HH:MM", title)`` tuples: entries of the requested day
    and entries falling after midnight.
    """

    def parse(self, s):
        """Feed the whole document `s` to the parser and flush it."""
        self.feed(s)
        self.close()

    def __init__(self, day_get, verbose=0):
        """Prepare a parser for the page of day `day_get` ("YYYYMMDD")."""
        sgmllib.SGMLParser.__init__(self, verbose)
        self.daynow = day_get

        # Next calendar day. Adding exactly 24h to local midnight lands on
        # 23:00 of the SAME day when the clock is moved back for DST, so
        # aim at (roughly) noon of the following day instead.
        midnight = time.mktime(time.strptime(day_get, "%Y%m%d"))
        self.daynext = time.strftime("%Y%m%d", time.localtime(midnight + 86400 + 43200))

        self.day = self.daynow
        self.guidatoday = []
        self.guidatomorrow = []
        self.sera = False
        self.tomorrow = False
        self.start_orario = False
        self.start_titolo = False
        self.inside_a_titolo = False
        self.inside_palinsesto = False
        # "YYYY-MM-DD HH:MM" of the entry being parsed; None until a valid
        # time has been read
        self.dataoraevento = None

    def start_div(self, attributes):
        """Enter a schedule entry when a <div class="intG"> is opened."""
        for name, value in attributes:
            if name == "class" and value == "intG":
                self.inside_palinsesto = True

    def start_span(self, attributes):
        """Inside an entry, note whether a time or a title span starts."""
        if not self.inside_palinsesto:
            return
        for name, value in attributes:
            if name != "class":
                continue
            if value == "ora":
                self.start_orario = True
            if value == "info":
                self.start_titolo = True

    def start_a(self, attributes):
        """Inside a title span, an <a> tag introduces the title text."""
        if self.inside_palinsesto and self.start_titolo:
            self.inside_a_titolo = True

    def _end_entry(self):
        """Leave the current schedule entry."""
        self.inside_a_titolo = False
        self.start_titolo = False
        self.inside_palinsesto = False

    def handle_data(self, data):
        """Consume a text chunk: either a start time or a title."""
        if not self.inside_palinsesto:
            return

        if self.start_orario:
            text = data.strip()
            if text == '':
                # whitespace around the time: keep waiting for the real text
                return

            try:
                hour = time.strptime(text, "%H:%M").tm_hour
            except ValueError:
                # not a time: drop this entry instead of aborting the run
                self.start_orario = False
                self._end_entry()
                return

            # a time before 06:00 belongs to the next day
            if hour < 6:
                self.day = self.daynext
                self.tomorrow = True
            elif self.tomorrow:
                # back to >= 06:00 after the night slot: the rest of the
                # page is not collected
                self._end_entry()
                return

            self.dataoraevento = time.strftime(
                "%Y-%m-%d %H:%M",
                time.strptime(self.day + '-' + text, "%Y%m%d-%H:%M"))
            self.start_orario = False

        if self.inside_a_titolo:
            # a title is stored only when a start time is known for it
            if self.dataoraevento is not None:
                if not self.tomorrow:
                    self.guidatoday.append((self.dataoraevento, data.strip()))
                else:
                    self.guidatomorrow.append((self.dataoraevento, data.strip()))
            self._end_entry()

    def get_guida(self):
        """Return the tuple (entries of the day, entries after midnight)."""
        return (self.guidatoday, self.guidatomorrow)
111
112
113	# =================================================================
114
115
class main:
    """Download the RAI schedule pages, cache them and load them in CrossEPG.

    Work is split in two steps:

    * download_and_cache() fetches one HTML page per channel and day,
      parses it with Titolo_parser and writes one UTF-8 text file per
      channel/day in the cache directory;
    * process_cache() reads the cache files back and adds channels and
      events to the CrossEPG database.

    Cache file layout (fields separated by FIELD_SEPARATOR):

    * line 1: channel id, channel name, provider, day ("YYYYMMDD");
    * line 2: a remark describing the columns;
    * other lines: local time, unix time, event title, event description.
    """

    # main config file
    CONF_CONFIGFILENAME = "rai.conf"

    # Network socket timeout (in seconds)
    CONF_SOCKET_TIMEOUT = 20

    # log text
    CONF_LOG_SCRIPT_NAME = "RAI (Italy)"
    CONF_LOG_PREFIX = "RAI: "

    # retry number if HTTP error
    HTTP_ERROR_RETRY = 3
    # seconds to wait between retries
    HTTP_ERROR_WAIT_RETRY = 5

    # random time delay (in seconds) between access to remote web pages
    CONF_RANDOM_MIN = 0.0
    CONF_RANDOM_MAX = 2.0

    # charset used in remote website epg data
    REMOTE_EPG_CHARSET = 'utf-8'

    TODAY = ''
    DAYCACHE = []
    FIELD_SEPARATOR = '###'
    # class-level default only: __init__ gives every instance its own dict
    CHANNELLIST = {}

    def log(self, s, video=0):
        """Write `s` to the log; with video == 1 also show it on screen."""
        self.logging.log(self.CONF_LOG_PREFIX + str(s))
        if video == 1:
            self.log2video(str(s))

    def log2video(self, s):
        """Show `s` as status text through the scriptlib logger."""
        self.logging.log2video_status(str(s))

    def __init__(self, confdir, dbroot):
        """Read the configuration and prepare the download session.

        confdir -- directory containing CONF_CONFIGFILENAME
        dbroot  -- directory under which the cache directory is created

        Exits the process with status 1 when the configuration file is
        missing or when the [channels] section is empty.
        """
        # initialize logging
        self.logging = scriptlib.logging_class()
        # write to video OSD the script name
        self.logging.log2video_scriptname(self.CONF_LOG_SCRIPT_NAME)

        CONF_FILE = os.path.join(confdir, self.CONF_CONFIGFILENAME)
        if not os.path.exists(CONF_FILE):
            self.log("ERROR: %s not present" % CONF_FILE, 1)
            sys.exit(1)

        config = ConfigParser.ConfigParser()
        config.optionxform = str  # needed to return case sensitive index
        config.read(CONF_FILE)

        # reading [global] section options
        self.CONF_DEFAULT_PROVIDER = config.get("global", "DEFAULT_PROVIDER")
        # save cache under dbroot
        self.CONF_CACHEDIR = os.path.join(dbroot, config.get("global", "CACHE_DIRNAME"))

        self.CONF_MAX_DAY_EPG = config.getint("global", "MAX_DAY_EPG")
        self.CONF_URL = config.get("global", "URL")

        # GMT_ZONE is either the word "equal" (no correction) or a number
        # of hours, to which scriptlib.delta_dst() is added (zones >= 0)
        # or subtracted (zones < 0)
        self.CONF_GMT_ZONE = config.get("global", "GMT_ZONE")
        if self.CONF_GMT_ZONE.strip(' ').lower() == 'equal':
            self.DELTA_UTC = 0
        else:
            self.DELTA_UTC = float(self.CONF_GMT_ZONE) * 3600.0
            if self.DELTA_UTC >= 0:
                self.DELTA_UTC = self.DELTA_UTC + scriptlib.delta_dst()
            else:
                self.DELTA_UTC = self.DELTA_UTC - scriptlib.delta_dst()

        self.DELTA_UTC = int(self.DELTA_UTC)

        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("Creating \'%s\' directory for caching" % self.CONF_CACHEDIR)
            # makedirs: also works when intermediate directories are missing
            os.makedirs(self.CONF_CACHEDIR)

        # reading [channels] section into a per-instance dictionary
        # (index = channel ID); writing through self.CHANNELLIST[...]
        # without this assignment would fill the dict shared by the class
        self.CHANNELLIST = {}
        for (ch_key, ch_value) in config.items("channels"):
            self.CHANNELLIST[ch_key] = unicode(ch_value, 'utf-8')

        if len(self.CHANNELLIST) == 0:
            self.log("ERROR: [channels] section empty ?", 1)
            sys.exit(1)

        # set network socket timeout
        socket.setdefaulttimeout(self.CONF_SOCKET_TIMEOUT)

        # initialize random generator
        random.seed()

        # today date (format YYYYMMDD)
        self.TODAY = time.strftime("%Y%m%d")

        # list of dates (format YYYYMMDD): today plus the following days,
        # MAX_DAY_EPG entries in total (today is always present).
        # Days are stepped from noon of today so that a DST clock change
        # can neither repeat nor skip a date.
        noon_today = time.mktime(time.strptime(self.TODAY + "12", "%Y%m%d%H"))
        self.DAYCACHE = [self.TODAY]
        for day in range(1, self.CONF_MAX_DAY_EPG):
            self.DAYCACHE.append(time.strftime("%Y%m%d", time.localtime(noon_today + 86400 * day)))

    # ----------------------------------------------------------------------

    def _fetch(self, url):
        """Return the body of `url`, or None when every attempt failed.

        Up to HTTP_ERROR_RETRY attempts are made; each one is preceded by
        a random pause and each failure is followed by a fixed pause.
        """
        attempts_left = self.HTTP_ERROR_RETRY
        while attempts_left > 0:
            # wait randomly to avoid overloading website
            time.sleep(random.uniform(self.CONF_RANDOM_MIN, self.CONF_RANDOM_MAX))

            try:
                sock = urllib2.urlopen(url)
                try:
                    return sock.read()
                finally:
                    # close the connection even when read() fails
                    sock.close()
            except IOError:
                # sys.exc_info() instead of "except X, e" / "except X as e":
                # this form is accepted by every Python version
                e = sys.exc_info()[1]
                serr = "unknown"
                if hasattr(e, 'reason'):
                    serr = str(e.reason)
                elif hasattr(e, 'code'):
                    serr = str(e.code)
                    if hasattr(e, 'msg'):
                        serr += " , " + str(e.msg)

                self.log("\'%s\' connection error. Reason: %s. Waiting %d sec. and retry [%d] ..." % (url, serr, self.HTTP_ERROR_WAIT_RETRY, attempts_left))
                time.sleep(self.HTTP_ERROR_WAIT_RETRY)
                attempts_left -= 1

        return None

    def _write_cache(self, eventfilepath, header, events):
        """Write one cache file: `header`, the remark line, then `events`.

        events -- list of ("YYYY-MM-DD HH:MM", title) tuples; titles are
                  byte strings encoded with REMOTE_EPG_CHARSET
        """
        # write data in cache file using UTF-8 encoding
        fd = codecs.open(eventfilepath, "w", 'utf-8')
        try:
            fd.write(header + '\n')
            fd.write("Local Time (human readeable)###Unix GMT Time###Event Title###Event Description\n")

            for (event_starttime, titolo) in events:
                # time.mktime() reads the parsed time as local time; the
                # configured offset is then subtracted
                event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime, "%Y-%m-%d %H:%M"))) - self.DELTA_UTC)

                # decode the remote bytes; 'replace' keeps a single bad
                # byte from aborting the whole download
                event_title = unicode(titolo, self.REMOTE_EPG_CHARSET, 'replace')
                event_title = event_title.replace('\r', '')
                event_title = event_title.replace('\n', u' ')
                event_title = event_title.strip(u' ')

                event_description = u''

                fd.write(event_starttime + self.FIELD_SEPARATOR + event_startime_unix_gmt + self.FIELD_SEPARATOR + event_title + self.FIELD_SEPARATOR + event_description + '\n')
        finally:
            fd.close()

    def download_and_cache(self):
        """Download the schedule of every configured channel into the cache.

        Each [channels] value is "cacheopt[,name[,provider]]" where
        cacheopt is:

        0 : don't download/cache
        1 : download only the days that are not cached yet
        2 : always download, overwriting existing files
        3 : like 1, but the file of TODAY is always downloaded again

        A channel without a name stops the script with status 1. For a
        channel, downloading stops at the first day whose page yields no
        event.
        """
        self.log("--- START DOWNLOAD AND CACHE DATA ---")
        self.log2video("STARTING DOWNLOAD")

        self.log("Removing old cached files")
        scriptlib.cleanup_oldcachedfiles(self.CONF_CACHEDIR, self.FIELD_SEPARATOR)

        chlist = self.CHANNELLIST

        for c in sorted(chlist.keys()):
            self.guidatoday = []
            self.guidatomorrow = []

            options = chlist[c].split(",")
            cacheopt = int(options[0])

            # if cacheopt == 0, do nothing
            if cacheopt == 0:
                continue

            channel_name = ''
            if len(options) > 1 and options[1] != '':
                channel_name = options[1].strip(' ').lower()

            # if channel name is not present as option, quit with error
            if channel_name == '':
                self.log("ERROR ! ID=%s channel name not present" % c, 1)
                sys.exit(1)

            channel_provider = self.CONF_DEFAULT_PROVIDER
            if len(options) > 2 and options[2] != '':
                channel_provider = options[2].strip(' ').lower()

            if cacheopt not in (1, 2, 3):
                self.log("Warning: unknown cache option " + str(cacheopt))
                continue

            for day in self.DAYCACHE:
                day_get = time.strftime("%Y_%m_%d", time.strptime(day, "%Y%m%d"))
                xmlfile = "?%s_%s" % (c, day_get)
                url = self.CONF_URL + xmlfile

                eventfilename = scriptlib.fn_escape(str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + day)
                eventfilepath = os.path.join(self.CONF_CACHEDIR, eventfilename)

                # files are opened with "w", so a download simply
                # overwrites an existing cache file
                if os.path.exists(eventfilepath):
                    if cacheopt == 1 or (cacheopt == 3 and day != self.TODAY):
                        # the after-midnight events kept from the previous
                        # page belong to this skipped day: drop them so
                        # they cannot end up in a later day's file
                        self.guidatomorrow = []
                        continue

                self.log("Download HTML data from \'%s\'" % url)
                self.log2video("Download " + c)

                data = self._fetch(url)
                if data is None:
                    # nothing written for this day: same reasoning as above
                    self.guidatomorrow = []
                    continue

                dtparser = Titolo_parser(day)
                dtparser.parse(data)
                # events after midnight found on the previous page open
                # the list of this day
                self.guida = self.guidatomorrow
                (self.guidatoday, self.guidatomorrow) = dtparser.get_guida()

                # if no data, stop downloading this channel
                if len(self.guidatoday) == 0:
                    break

                self.guida = self.guida + self.guidatoday

                self.log(" writing in cache \'%s\'" % eventfilename)
                header = str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + channel_provider + self.FIELD_SEPARATOR + day
                self._write_cache(eventfilepath, header, self.guida)

    # ----------------------------------------------------------------------

    def _read_cache_group(self, filenames):
        """Read the cache files of one channel id.

        Returns (channel names, event lines). The names come from the
        header (first line) of the first usable file and are split on '|';
        the event lines are the lines after the two heading lines of every
        file, in file order. Empty files and files with a header that has
        no name field are logged and skipped.
        """
        channels_name = None
        events = []
        for f in filenames:
            self.log("Reading \'%s\'" % f)
            # read events from cache file using UTF-8
            fd = codecs.open(os.path.join(self.CONF_CACHEDIR, f), "r", "utf-8")
            try:
                lines = fd.readlines()
            finally:
                fd.close()

            if len(lines) == 0:
                self.log("Warning: \'%s\' is empty, skipped" % f)
                continue

            if channels_name is None:
                # first line has channel data (id,name,provider,date)
                header = lines[0].split(self.FIELD_SEPARATOR)
                if len(header) < 2:
                    self.log("Warning: \'%s\' has no valid header, skipped" % f)
                    continue
                channels_name = header[1].split('|')

            # the second line is only a remark
            # add events starting from third line
            events.extend(lines[2:])

        if channels_name is None:
            channels_name = []
        return (channels_name, events)

    def _parse_events(self, lines):
        """Turn event lines into a list of (unix start time, UTF-8 title).

        Lines without a numeric time field or without a title field are
        logged and left out.
        """
        parsed = []
        for line in lines:
            fields = line.split(self.FIELD_SEPARATOR)
            try:
                parsed.append((int(fields[1]), fields[2].encode('utf-8')))
            except (IndexError, ValueError):
                self.log("Warning: malformed cache line skipped")
        return parsed

    def process_cache(self):
        """Load every cached event into the CrossEPG database.

        Cache files are grouped by channel id (the file name part before
        the first FIELD_SEPARATOR). The duration of an event is the
        distance to the start of the next event of the same group; the
        last event of a group gets a dummy duration of 90 minutes.

        Exits the process with status 1 when the cache directory is
        missing.
        """
        self.log("--- START PROCESSING CACHE ---")
        self.log2video("START PROCESSING CACHE")
        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("ERROR: %s not present" % self.CONF_CACHEDIR, 1)
            sys.exit(1)

        self.log("Loading lamedb")
        lamedb = scriptlib.lamedb_class()

        self.log("Initialize CrossEPG database")
        crossdb = scriptlib.crossepg_db_class()
        crossdb.open_db()

        total_events = 0

        self.log("Start data processing")

        # group the sorted file names by channel id, keeping their order;
        # an empty cache directory simply gives no group
        groups = []
        for f in sorted(os.listdir(self.CONF_CACHEDIR)):
            if not os.path.isfile(os.path.join(self.CONF_CACHEDIR, f)):
                continue
            ch_id = f.split(self.FIELD_SEPARATOR)[0]
            if len(groups) > 0 and groups[-1][0] == ch_id:
                groups[-1][1].append(f)
            else:
                groups.append((ch_id, [f]))

        # RAI website HAVE NOT long description: a single blank is stored
        e_summarie = u' '.encode('utf-8')

        for (ch_id, filenames) in groups:
            (channels_name, lines) = self._read_cache_group(filenames)
            # split and convert the lines once, not once per SID
            events = self._parse_events(lines)
            last = len(events) - 1

            total_events += len(events)
            self.log(" ...processing \'%s\' , nr. events %d" % (ch_id, len(events)))
            self.log2video("processed %d events ..." % total_events)

            for c in channels_name:
                # a channel can have zero or more SID (different channel with same name)
                # return the list [0e1f:00820000:0708:00c8:1:0 , 1d20:00820000:2fa8:013e:1:0 , ..... ]
                # return [] if channel name is not in lamedb
                sidbyname = lamedb.get_sid_byname(c.strip(' \n').lower())

                # process every SID
                for s in sidbyname:
                    # convert "0e1f:00820000:0708:00c8:1:0" to sid,tsid,onid
                    # return the list [sid,tsid,onid]
                    ch_sid = lamedb.convert_sid(s)
                    if len(ch_sid) == 0:
                        continue

                    # add channel into db
                    # doesn't matter if the channel already exist... epgdb do all the work
                    crossdb.add_channel(ch_sid)

                    for (idx, (e_starttime, e_title)) in enumerate(events):
                        if idx < last:
                            e_length = events[idx + 1][0] - e_starttime
                        else:
                            # last event, dummy length 90 min.
                            e_length = 5400

                        # add_event(start_time , duration , title , summarie , ISO639_language_code , strings_encoded_with_UTF-8)
                        crossdb.add_event(e_starttime, e_length, e_title, e_summarie, 'ita', True)

        # end process, close CrossEPG DB saving data
        crossdb.close_db()
        self.log("TOTAL EPG EVENTS PROCESSED: %d" % total_events)
        self.log("--- END ---")
        self.log2video("END , events processed: %d" % total_events)
470
471
472
473	# ****************************************************************************************************************************
474
# MAIN CODE: the script starts here

# script directory, relative to the CrossEPG installation directory
SCRIPT_DIR = 'scripts/rai/'

# the script directory (where the configuration file is looked for) lives
# under the CrossEPG installation directory
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
scriptlocation = os.path.join(crossepg_instroot, SCRIPT_DIR)

# the CrossEPG data directory (dbroot) also hosts the script cache
crossepg_dbroot = crossepg.epgdb_get_dbroot()
if crossepg_dbroot == False:
    sys.exit(1)

# step 0: read configuration
script_class = main(scriptlocation, crossepg_dbroot)

# step 1: download data and cache them
script_class.download_and_cache()

# step 2: read cached data and inject them into the CrossEPG database
script_class.process_cache()
498

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: ipk/source/epg_crossepg/var/crossepg/scripts/rai/rai.py@ 15927

Download in other formats: