source: ipk/source/epg_crossepg/var/crossepg/scripts/rai/rai.py@ 15927

Last change on this file since 15927 was 7451, checked in by BPanther, 15 years ago

[ipk] - copy source->source.sh4

File size: 15.1 KB
Line 
1#!/usr/bin/python
2# rai.py by Ambrosa http://www.ambrosa.net
3# this module is used to download EPG data from the Rai website
4# derived from E2_LOADEPG
5
6__author__ = "ambrosa http://www.ambrosa.net"
7__copyright__ = "Copyright (C) 2008-2011 Alessandro Ambrosini"
8__license__ = "CreativeCommons by-nc-sa http://creativecommons.org/licenses/by-nc-sa/3.0/"
9
10import os
11import sys
12import time
13import codecs
14import socket
15import string
16import random
17import urllib2
18import ConfigParser
19
20# import CrossEPG functions
21import crossepg
22
# The helper modules imported further down live in "scripts/lib" under the
# CrossEPG installation root, so that directory has to be appended to
# sys.path before they can be imported.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    # the installation root could not be determined: stop with an error code
    sys.exit(1)
libdir = os.path.join(crossepg_instroot, 'scripts/lib')
sys.path.append(libdir)
30
31# import local modules
32import sgmllib
33import scriptlib
34
35# =================================================================
36# HTML PARSER
37
class Titolo_parser(sgmllib.SGMLParser):
    """Extract (start time, title) pairs from one day's schedule page.

    Recognised markup:
      * <div class="intG">   opens a schedule entry
      * <span class="ora">   the next text chunk is the start time ("HH:MM")
      * <span class="info">  the next <a> element carries the title
      * <a>                  its first text chunk is stored as the title

    Events are collected in two lists of ("YYYY-MM-DD HH:MM", title)
    tuples: one for the requested day and one for the events starting
    before 06:00, which are attributed to the following day.
    """

    def __init__(self, day_get, verbose=0):
        """Prepare the parser for the page of day `day_get` ("YYYYMMDD")."""
        sgmllib.SGMLParser.__init__(self, verbose)
        self.daynow = day_get
        # Compute the next calendar day starting from 12:00 instead of 00:00:
        # midnight + 24h falls on the SAME date when daylight saving time
        # ends (25 hours long day), noon + 24h never does.
        noon = time.mktime(time.strptime(day_get + " 12", "%Y%m%d %H"))
        self.daynext = time.strftime("%Y%m%d", time.localtime(noon + 86400))
        self.day = self.daynow
        self.guidatoday = []
        self.guidatomorrow = []
        self.sera = False
        self.tomorrow = False
        self.start_orario = False
        self.start_titolo = False
        self.inside_a_titolo = False
        self.inside_palinsesto = False
        # start time of the entry being parsed; None until a time is read
        self.dataoraevento = None

    def parse(self, s):
        """Feed the whole document `s` to the parser and finalize it."""
        self.feed(s)
        self.close()

    def start_div(self, attributes):
        """A <div class="intG"> marks the beginning of a schedule entry."""
        for name, value in attributes:
            if name == "class":
                if value == "intG":
                    self.inside_palinsesto = True

    def start_span(self, attributes):
        """Inside an entry, note whether a time or a title is coming next."""
        if self.inside_palinsesto == True:
            for name, value in attributes:
                if name == "class":
                    if value == "ora":
                        self.start_orario = True
                    if value == "info":
                        self.start_titolo = True

    def start_a(self, attributes):
        """The <a> following <span class="info"> contains the title text."""
        if self.inside_palinsesto == True:
            if self.start_titolo == True:
                self.inside_a_titolo = True

    def _close_entry(self):
        # forget everything about the current entry; parsing resumes at the
        # next <div class="intG">
        self.inside_a_titolo = False
        self.start_titolo = False
        self.inside_palinsesto = False

    def handle_data(self, data):
        """Consume a text chunk: either the start time or the title."""
        if not self.inside_palinsesto:
            return

        if self.start_orario == True:
            text = data.strip()
            if text == '':
                # whitespace-only chunk: keep waiting for the real time text
                return

            try:
                hour = time.strptime(text, "%H:%M").tm_hour
            except ValueError:
                # not a "HH:MM" time: drop this entry instead of aborting
                # the whole download
                self.start_orario = False
                self._close_entry()
                return

            # if time < 06:00 is a next day event
            if hour < 6:
                self.day = self.daynext
                self.tomorrow = True
            else:
                if self.tomorrow == True:
                    # times are >= 06:00 again after the night block:
                    # this entry is not collected
                    self._close_entry()
                    return

            self.dataoraevento = time.strftime("%Y-%m-%d %H:%M", time.strptime(self.day + '-' + text, "%Y%m%d-%H:%M"))
            self.start_orario = False

        if self.inside_a_titolo == True:
            # a title without a previously read start time cannot be stored
            if self.dataoraevento is not None:
                if self.tomorrow == False:
                    self.guidatoday.append((self.dataoraevento, data.strip()))
                else:
                    self.guidatomorrow.append((self.dataoraevento, data.strip()))

            self._close_entry()

    def get_guida(self):
        """Return the tuple (events of the day, events after midnight)."""
        return ((self.guidatoday, self.guidatomorrow))
111
112
113# =================================================================
114
115
class main:
    """Download the RAI EPG pages, cache them on disk and load CrossEPG.

    Typical use: build the object with the directory holding "rai.conf"
    and the CrossEPG db root, then call download_and_cache() followed by
    process_cache().
    """

    # main config file
    CONF_CONFIGFILENAME = "rai.conf"

    # Network socket timeout (in seconds)
    CONF_SOCKET_TIMEOUT = 20

    # log text
    CONF_LOG_SCRIPT_NAME = "RAI (Italy)"
    CONF_LOG_PREFIX = "RAI: "

    # retry number if HTTP error
    HTTP_ERROR_RETRY = 3
    # seconds to wait between retries
    HTTP_ERROR_WAIT_RETRY = 5

    # random time delay (in seconds) between access to remote web pages
    CONF_RANDOM_MIN = 0.0
    CONF_RANDOM_MAX = 2.0

    # charset used in remote website epg data
    REMOTE_EPG_CHARSET = 'utf-8'

    TODAY = ''
    DAYCACHE = []
    FIELD_SEPARATOR = '###'
    CHANNELLIST = {}

    def log(self, s, video=0):
        """Write `s` to the log; when video == 1 show it on the OSD too."""
        self.logging.log(self.CONF_LOG_PREFIX + str(s))
        if video == 1:
            self.log2video(str(s))

    def log2video(self, s):
        """Show `s` as status text on the video OSD."""
        self.logging.log2video_status(str(s))

    def __init__(self, confdir, dbroot):
        """Read the configuration and prepare cache dir and day list.

        confdir -- directory containing the "rai.conf" file
        dbroot  -- directory under which the cache directory is created

        Exits the process with status 1 when the config file is missing
        or the [channels] section is empty.
        """
        # initialize logging
        self.logging = scriptlib.logging_class()
        # write to video OSD the script name
        self.logging.log2video_scriptname(self.CONF_LOG_SCRIPT_NAME)

        CONF_FILE = os.path.join(confdir, self.CONF_CONFIGFILENAME)
        if not os.path.exists(CONF_FILE):
            self.log("ERROR: %s not present" % CONF_FILE, 1)
            sys.exit(1)

        config = ConfigParser.ConfigParser()
        config.optionxform = str  # needed to return case sensitive index
        config.read(CONF_FILE)

        # reading [global] section options
        self.CONF_DEFAULT_PROVIDER = config.get("global", "DEFAULT_PROVIDER")
        # save cache under dbroot
        self.CONF_CACHEDIR = os.path.join(dbroot, config.get("global", "CACHE_DIRNAME"))

        self.CONF_MAX_DAY_EPG = config.getint("global", "MAX_DAY_EPG")
        self.CONF_URL = config.get("global", "URL")

        # GMT_ZONE is either the word "equal" (no correction) or the website
        # timezone offset in hours, adjusted by the DST delta from scriptlib
        self.CONF_GMT_ZONE = config.get("global", "GMT_ZONE")
        if self.CONF_GMT_ZONE.strip(' ').lower() == 'equal':
            self.DELTA_UTC = 0
        else:
            self.DELTA_UTC = float(self.CONF_GMT_ZONE) * 3600.0
            if self.DELTA_UTC >= 0:
                self.DELTA_UTC = self.DELTA_UTC + scriptlib.delta_dst()
            else:
                self.DELTA_UTC = self.DELTA_UTC - scriptlib.delta_dst()

        self.DELTA_UTC = int(self.DELTA_UTC)

        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("Creating \'%s\' directory for caching" % self.CONF_CACHEDIR)
            os.mkdir(self.CONF_CACHEDIR)

        # reading [channels] section into a dictionary with index = channel ID.
        # A new dictionary is bound to the instance so that the class level
        # CHANNELLIST is not modified (it would be shared by every instance).
        self.CHANNELLIST = {}
        for (chid, chvalue) in config.items("channels"):
            self.CHANNELLIST[chid] = unicode(chvalue, 'utf-8')

        if len(self.CHANNELLIST) == 0:
            self.log("ERROR: [channels] section empty ?", 1)
            sys.exit(1)

        # set network socket timeout
        socket.setdefaulttimeout(self.CONF_SOCKET_TIMEOUT)

        # initialize random generator
        random.seed()

        # today date (format AAAAMMDD)
        self.TODAY = time.strftime("%Y%m%d")

        # create a list filled with dates (format AAAAMMDD) from today to
        # today+MAX_DAY_EPG. Days are counted from today at 12:00: adding
        # multiples of 24h to "now" may repeat or skip a date when a daylight
        # saving time switch falls inside the interval.
        noon_today = time.mktime(time.strptime(self.TODAY + " 12", "%Y%m%d %H"))
        self.DAYCACHE = [self.TODAY]
        for day in range(1, self.CONF_MAX_DAY_EPG):
            self.DAYCACHE.append(time.strftime("%Y%m%d", time.localtime(noon_today + 86400 * day)))

    # ----------------------------------------------------------------------

    def _fetch_page(self, url):
        """Download `url`, retrying HTTP_ERROR_RETRY times on I/O errors.

        Returns the page content, or None when every attempt failed.
        """
        i = self.HTTP_ERROR_RETRY
        while i > 0:
            # wait randomly to avoid overloading website
            time.sleep(random.uniform(self.CONF_RANDOM_MIN, self.CONF_RANDOM_MAX))

            try:
                sock = urllib2.urlopen(url)
                try:
                    return sock.read()
                finally:
                    # release the connection even when read() fails
                    sock.close()

            except IOError:
                e = sys.exc_info()[1]
                serr = "unknown"
                if hasattr(e, 'reason'):
                    serr = str(e.reason)
                elif hasattr(e, 'code'):
                    serr = str(e.code)
                    if hasattr(e, 'msg'):
                        serr += " , " + str(e.msg)

                self.log("\'%s\' connection error. Reason: %s. Waiting %d sec. and retry [%d] ..." % (url, serr, self.HTTP_ERROR_WAIT_RETRY, i))
                time.sleep(self.HTTP_ERROR_WAIT_RETRY)  # add sleep
                i -= 1

        return None

    def _write_cache_file(self, eventfilepath, header, guida):
        """Write `header` and the (start time, title) events to a cache file.

        The file is written with UTF-8 encoding: first the channel header
        line, then a remark line, then one line per event.
        """
        fd = codecs.open(eventfilepath, "w", 'utf-8')
        try:
            fd.write(header + '\n')
            fd.write("Local Time (human readeable)###Unix GMT Time###Event Title###Event Description\n")

            for (event_starttime, titolo) in guida:
                # time.mktime interprets the value as local time; the
                # configured website offset is then subtracted
                event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime, "%Y-%m-%d %H:%M"))) - self.DELTA_UTC)

                # convert remote data to Python Unicode; undecodable bytes are
                # replaced so that one bad title does not abort the whole run
                event_title = unicode(titolo, self.REMOTE_EPG_CHARSET, 'replace')

                event_title = event_title.replace('\r', '')
                event_title = event_title.replace('\n', u' ')
                event_title = event_title.strip(u' ')

                event_description = u''

                fd.write(event_starttime + self.FIELD_SEPARATOR + event_startime_unix_gmt + self.FIELD_SEPARATOR + event_title + self.FIELD_SEPARATOR + event_description + '\n')
        finally:
            # never leave the file handle open, even if a write fails
            fd.close()

    def download_and_cache(self):
        """Download the schedule pages of every enabled channel into the cache.

        Each [channels] value has the form "cacheopt,name[,provider]" where
        cacheopt is:
          0 : don't download/cache
          1 : download and cache only the missing files
          2 : always download overwriting existing files
          3 : always download TODAY, other days only when missing
        Exits the process with status 1 if an enabled channel has no name.
        """
        self.log("--- START DOWNLOAD AND CACHE DATA ---")
        self.log2video("STARTING DOWNLOAD")

        self.log("Removing old cached files")
        scriptlib.cleanup_oldcachedfiles(self.CONF_CACHEDIR, self.FIELD_SEPARATOR)

        chlist = self.CHANNELLIST

        for c in sorted(chlist.keys()):
            self.guidatoday = []
            self.guidatomorrow = []

            options = chlist[c].split(",")
            cacheopt = int(options[0])

            # if cacheopt == 0, do nothing
            if cacheopt == 0:
                continue

            channel_name = ''
            if len(options) > 1 and options[1] != '':
                channel_name = options[1].strip(' ').lower()

            # if channel name is not present as option, quit with error
            if channel_name == '':
                self.log("ERROR ! ID=%s channel name not present" % c, 1)
                sys.exit(1)

            channel_provider = self.CONF_DEFAULT_PROVIDER
            if len(options) > 2 and options[2] != '':
                channel_provider = options[2].strip(' ').lower()

            if cacheopt not in (1, 2, 3):
                self.log("Warning: unknown cache option " + str(cacheopt))
                continue

            for day in self.DAYCACHE:
                day_get = time.strftime("%Y_%m_%d", time.strptime(day, "%Y%m%d"))
                url = self.CONF_URL + "?%s_%s" % (c, day_get)

                eventfilename = scriptlib.fn_escape(str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + day)
                eventfilepath = os.path.join(self.CONF_CACHEDIR, eventfilename)

                # an existing file is kept with option 1, and with option 3
                # for every day but today; option 2 always overwrites
                if os.path.exists(eventfilepath) and (cacheopt == 1 or (cacheopt == 3 and day != self.TODAY)):
                    # The after-midnight events collected from the previous
                    # page belong to the day after that page; they must not
                    # be carried over a skipped day into a later file, where
                    # they would be duplicated and out of order.
                    self.guidatomorrow = []
                    continue

                self.log("Download HTML data from \'%s\'" % url)
                self.log2video("Download " + c)

                data = self._fetch_page(url)
                if data is None:
                    # every retry failed, try with the next day
                    continue

                dtparser = Titolo_parser(day)
                dtparser.parse(data)
                # events after midnight found in the previous page come first
                self.guida = self.guidatomorrow
                (self.guidatoday, self.guidatomorrow) = dtparser.get_guida()

                # if no data, stop downloading this channel
                if len(self.guidatoday) == 0:
                    break

                self.guida = self.guida + self.guidatoday

                self.log(" writing in cache \'%s\'" % eventfilename)
                header = str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + channel_provider + self.FIELD_SEPARATOR + day
                self._write_cache_file(eventfilepath, header, self.guida)

    # ----------------------------------------------------------------------

    def _parse_cached_events(self, lines):
        """Convert cache file lines into a list of (start time, title).

        The start time is the integer read from the second field, the title
        is the third field encoded with UTF-8. Lines without enough fields
        or with a non numeric start time are skipped.
        """
        parsed = []
        for line in lines:
            fields = line.split(self.FIELD_SEPARATOR)
            if len(fields) < 3:
                continue
            try:
                e_starttime = int(fields[1])
            except ValueError:
                continue
            parsed.append((e_starttime, fields[2].encode('utf-8')))
        return parsed

    def _store_channel(self, lamedb, crossdb, channels_name, events):
        """Add `events` to CrossEPG for every SID matching `channels_name`.

        channels_name -- iterable of channel names to look up in lamedb
        events        -- list of (start time, title) as returned by
                         _parse_cached_events()
        """
        last = len(events) - 1
        # RAI website HAVE NOT long description: a blank summary is stored
        e_summarie = u' '.encode('utf-8')

        for c in channels_name:
            # a channel can have zero or more SID (different channel with same name)
            # return the list [0e1f:00820000:0708:00c8:1:0 , 1d20:00820000:2fa8:013e:1:0 , ..... ]
            # return [] if channel name is not in lamedb
            sidbyname = lamedb.get_sid_byname(c.strip(' \n').lower())

            for s in sidbyname:
                # convert "0e1f:00820000:0708:00c8:1:0" to sid,tsid,onid
                ch_sid = lamedb.convert_sid(s)
                if len(ch_sid) == 0:
                    continue

                # add channel into db
                # doesn't matter if the channel already exist... epgdb do all the work
                crossdb.add_channel(ch_sid)

                for (i, (e_starttime, e_title)) in enumerate(events):
                    if i < last:
                        # an event lasts until the next one starts
                        e_length = events[i + 1][0] - e_starttime
                    else:
                        # last event, dummy length 90 min.
                        e_length = 5400

                    # add_event(start_time , duration , title , summarie , ISO639_language_code , strings_encoded_with_UTF-8)
                    crossdb.add_event(e_starttime, e_length, e_title, e_summarie, 'ita', True)

    def process_cache(self):
        """Read every cache file and inject its events into CrossEPG.

        Cache files are processed in sorted order and grouped by the channel
        id that prefixes the file name; the events of a group are stored
        when the next group starts and after the last file. An empty cache
        directory simply produces zero events.
        Exits the process with status 1 if the cache directory is missing.
        """
        self.log("--- START PROCESSING CACHE ---")
        self.log2video("START PROCESSING CACHE")
        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("ERROR: %s not present" % self.CONF_CACHEDIR, 1)
            sys.exit(1)

        self.log("Loading lamedb")
        lamedb = scriptlib.lamedb_class()

        self.log("Initialize CrossEPG database")
        crossdb = scriptlib.crossepg_db_class()
        crossdb.open_db()

        lines_of_channel = []
        previous_id = ''
        channels_name = ''
        total_events = 0

        self.log("Start data processing")

        for f in sorted(os.listdir(self.CONF_CACHEDIR)):
            channel_id = f.split(self.FIELD_SEPARATOR)[0]

            if previous_id != '' and channel_id != previous_id:
                # files of a new channel begin: store the previous channel
                events = self._parse_cached_events(lines_of_channel)
                total_events += len(events)
                self.log(" ...processing \'%s\' , nr. events %d" % (previous_id, len(events)))
                self.log2video("processed %d events ..." % total_events)
                self._store_channel(lamedb, crossdb, channels_name, events)

                lines_of_channel = []
                channels_name = ''

            previous_id = channel_id

            self.log("Reading \'%s\'" % f)
            # read events from cache file using UTF-8
            fd = codecs.open(os.path.join(self.CONF_CACHEDIR, f), "r", "utf-8")
            try:
                lines = fd.readlines()
            finally:
                fd.close()

            if len(lines) == 0:
                # empty file (e.g. an interrupted write): nothing to load
                continue

            if channels_name == '':
                # first line has channel data (id,name,provider,date)
                header = lines[0].split(self.FIELD_SEPARATOR)
                if len(header) < 2:
                    # not a cache file written by this script
                    continue
                channels_name = header[1].split('|')

            # the second line is only a remark
            # add events starting from third line
            lines_of_channel.extend(lines[2:])

        if previous_id != '':
            # store the last channel
            events = self._parse_cached_events(lines_of_channel)
            total_events += len(events)
            self.log(" ...processing \'%s\' , nr. events %d" % (previous_id, len(events)))
            self.log2video("processed %d events ..." % total_events)
            self._store_channel(lamedb, crossdb, channels_name, events)

        # end process, close CrossEPG DB saving data
        crossdb.close_db()
        self.log("TOTAL EPG EVENTS PROCESSED: %d" % total_events)
        self.log("--- END ---")
        self.log2video("END , events processed: %d" % total_events)
470
471
472
473# ****************************************************************************************************************************
474
475# MAIN CODE: SCRIPT START HERE
476
# directory of this script, relative to the CrossEPG installation root;
# the configuration file is searched there
SCRIPT_DIR = 'scripts/rai/'

# the installation root is needed to build the absolute script location
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
scriptlocation = os.path.join(crossepg_instroot, SCRIPT_DIR)

# the directory where CrossEPG saves its data (dbroot) also hosts the
# cache of this script
crossepg_dbroot = crossepg.epgdb_get_dbroot()
if crossepg_dbroot == False:
    sys.exit(1)

# step 1: read the configuration
script_class = main(scriptlocation, crossepg_dbroot)

# step 2: fetch the remote pages and store them in the cache
script_class.download_and_cache()

# step 3: load the cached events into the CrossEPG database
script_class.process_cache()
498
Note: See TracBrowser for help on using the repository browser.