source: ipk/source/epg_crossepg/var/crossepg/scripts/mediaprem/mediaprem.py@ 17993

Last change on this file since 17993 was 7451, checked in by BPanther, 15 years ago

[ipk] - copy source->source.sh4

File size: 17.5 KB
Line 
#!/usr/bin/python
# mediaprem.py by Ambrosa http://www.ambrosa.net
# this module is used for download EPG data from Mediaset website
# derived from E2_LOADEPG
# NOTE(review): Python 2 only (urllib2, ConfigParser, sgmllib, unicode).

__author__ = "ambrosa http://www.ambrosa.net"
__copyright__ = "Copyright (C) 2008-2011 Alessandro Ambrosini"
__license__ = "CreativeCommons by-nc-sa http://creativecommons.org/licenses/by-nc-sa/3.0/"

# standard library imports
import gc
import os
import sys
import time
import codecs
import socket
import urllib
import urllib2
import ConfigParser
#from xml.dom import minidom

# import CrossEPG functions (project-local C-extension/module)
import crossepg

# location of local python modules under "scripts/lib" dir.
# add it to sys.path()
# epgdb_get_installroot() returns False on failure, hence the == False check.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
libdir = os.path.join(crossepg_instroot , 'scripts/lib')
sys.path.append(libdir)

# import local modules (resolved via the sys.path entry added above;
# sgmllib here is the bundled copy under scripts/lib, not the stdlib one)
import sgmllib
import scriptlib
35
36# =================================================================
37# HTML PARSER used for parsing description
38
39
class Description_parser(sgmllib.SGMLParser):
    """SGML parser that extracts the programme description text from an
    event detail HTML page.

    The description lives inside a <div class="txtBox_cms"> (which the
    pages nest inside a <div class="box_Text">); all character data seen
    while that div is open is accumulated into ``self.description``.
    """

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        # True while the corresponding <div> is open
        self.start_div_box = False
        self.start_div_boxtxt = False
        # accumulated description text (unicode)
        self.description = ''

    def parse(self, s):
        """Feed the whole HTML page to the parser and finalize parsing."""
        self.feed(s)
        self.close()

    def start_div(self, attributes):
        # sgmllib callback for every <div ...>: flag the divs of interest
        for attr_name, attr_value in attributes:
            if attr_name != "class":
                continue
            if attr_value == "box_Text":
                self.start_div_box = True
            elif attr_value == "txtBox_cms":
                self.start_div_boxtxt = True

    def end_div(self):
        # sgmllib callback for every </div>: closing the description box
        # clears both flags so later divs are ignored
        if self.start_div_boxtxt:
            self.start_div_box = False
            self.start_div_boxtxt = False

    def handle_data(self, data):
        # collect text only while inside the description <div>;
        # pages are served as iso-8859-1, decode to unicode here
        if self.start_div_boxtxt:
            self.description += data.decode('iso-8859-1')

    def get_descr(self):
        """Return the collected description stripped of blanks/newlines."""
        return self.description.strip(' \n\r')
72
73
74
75# =================================================================
76
77
class main(sgmllib.SGMLParser):
    """Downloader/processor for Mediaset Premium EPG data.

    The class is itself an SGML parser: sgmllib dispatches each XML tag
    <foo> to ``start_foo``/``end_foo`` methods below.  Tag names are
    Italian: palinsesto = schedule, giorno = day, canale = channel,
    prg = programme, titolo = title, linkscheda = detail-page link.

    Workflow: ``download_and_cache()`` fetches the XML feed and writes one
    UTF-8 cache file per (channel, day); ``process_cache()`` re-reads the
    cache files and injects the events into the CrossEPG database.
    """

    # main config file
    CONF_CONFIGFILENAME = "mediaprem.conf"

    # Network socket timeout (in seconds)
    CONF_SOCKET_TIMEOUT = 20

    # log text
    CONF_LOG_SCRIPT_NAME = "MediasetPremium (Italy)"
    CONF_LOG_PREFIX = ""

    # max chars in description
    CONF_DLDESCMAXCHAR = 250

    # retry number if HTTP error
    HTTP_ERROR_RETRY = 3
    # seconds to wait between retries
    HTTP_ERROR_WAIT_RETRY = 5

    # charset used in remote website epg data
    REMOTE_EPG_CHARSET = 'utf-8'

    # today's date as "YYYY/MM/DD" (set in __init__)
    TODAYMP = ''
    # list of dates (YYYY/MM/DD) to cache: today .. today+MAX_DAY_EPG-1
    DAYCACHEMP = []
    # field separator used both in cache file names and cache file lines
    FIELD_SEPARATOR = '###'
    # channel id -> "cacheopt,name[,provider]" read from [channels] section
    CHANNELLIST = {}

    # url hash -> already-downloaded description (in-memory cache)
    DESCRIPTIONS_WEBCACHE = {}

    # -------- xml processing using SGMLLIB -----------
    # best way is use xml.minidom but it's very memory hungry (about 40MB
    # memory for 2 MB XML file); sgmllib can simply parse xml data.
    # The SGML_* attributes below are parser state shared across callbacks.
    SGML_PALINSESTO_INSIDE = False
    SGML_TITOLO_INSIDE = False
    SGML_LINKSCHEDA_INSIDE = False

    SGML_GIORNOMP = None        # current <giorno> date, None when skipping
    SGML_CHID = None            # current <canale> id
    SGML_FD = None              # open cache file, None when channel skipped
    SGML_TOTAL_EVENTS = 0

    SGML_EVENT_STARTHOUR = None
    SGML_EVENT_TITLE = None
    SGML_EVENT_SUMMARIE = None

    def parse(self, s):
        """Feed the whole XML document to the parser and finalize it."""
        self.feed(s)
        self.close()

    def start_palinsesto(self, attr):
        """<palinsesto>: mark that we are inside the schedule root."""
        self.SGML_PALINSESTO_INSIDE = True

    def end_palinsesto(self):
        """</palinsesto>: reset state and log the grand total."""
        self.SGML_PALINSESTO_INSIDE = False
        self.SGML_GIORNOMP = None
        self.log("extracted %d events" % self.SGML_TOTAL_EVENTS)

    def start_giorno(self,attr):
        """<giorno data="YYYY/MM/DD">: accept the day only if it falls in
        the DAYCACHEMP window, otherwise leave SGML_GIORNOMP = None so the
        following <canale>/<prg> tags are ignored."""
        if self.SGML_PALINSESTO_INSIDE == True :
            self.SGML_GIORNOMP = None
            for name,value in attr:
                if name == "data":
                    if str(value).strip(' \n\r') in self.DAYCACHEMP :
                        self.SGML_GIORNOMP = str(value).strip(' \n\r')
                    break

    def end_giorno(self):
        """</giorno>: no current day anymore."""
        self.SGML_GIORNOMP = None

    def start_canale(self,attr):
        """<canale id="...">: decide, from the per-channel cache option,
        whether to (re)create the cache file for (channel, current day).

        On success SGML_FD is left open and the two header lines are
        written; on any skip condition SGML_FD stays None and the events
        of this channel are discarded by the other callbacks.
        """
        if self.SGML_GIORNOMP != None:
            for name,value in attr:
                if name == "id":
                    self.SGML_CHID = str(value).strip(' \n\r').lower()

                    # channel id present in XML but not configured: warn and skip
                    if not self.CHANNELLIST.has_key(self.SGML_CHID) :
                        self.log("Warning: new channel id=%s found in XML data" % self.SGML_CHID )
                        break

                    # get cache option
                    # 0 : don't download/cache
                    # 1 : download and cache (optional 1,new_name )
                    # 2 : always download overwriting existing files (optional 2,new_name )
                    # 3 : always download overwriting existing files only for TODAY (optional 3,new_name )
                    cacheopt = int(self.CHANNELLIST[self.SGML_CHID].split(",")[0])

                    # if cacheopt == 0, do nothing
                    if cacheopt == 0:
                        break

                    channel_name = ''
                    if len(self.CHANNELLIST[self.SGML_CHID].split(",")) > 1 :
                        if self.CHANNELLIST[self.SGML_CHID].split(",")[1] != '' :
                            # channel renamed, new name provided by user
                            channel_name = self.CHANNELLIST[self.SGML_CHID].split(",")[1].strip(' \n\r').lower()

                    # if channel name is not present as option, quit with error
                    if channel_name == '':
                        self.log("ERROR ! ID=%s channel name not present" % self.SGML_CHID)
                        sys.exit(1)

                    # optional third field overrides the default provider
                    channel_provider = self.CONF_DEFAULT_PROVIDER
                    if len(self.CHANNELLIST[self.SGML_CHID].split(",")) > 2 :
                        if self.CHANNELLIST[self.SGML_CHID].split(",")[2] != '' :
                            channel_provider = self.CHANNELLIST[self.SGML_CHID].split(",")[2].strip(' \n\r').lower()

                    # NOTE(review): unreachable — channel_name == '' already
                    # exited above; kept as in the original source.
                    if channel_name == '':
                        self.log("ERROR ! ID=" + self.SGML_CHID + " channel name not present. Skip !")
                        break

                    day = str(self.convert_daymp(self.SGML_GIORNOMP))
                    eventfilename = scriptlib.fn_escape(self.SGML_CHID + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + day)
                    eventfilepath = os.path.join(self.CONF_CACHEDIR, eventfilename)
                    # cacheopt 1: keep an existing cache file as-is
                    if (cacheopt == 1) and os.path.exists(eventfilepath):
                        break
                    # cacheopt 3: overwrite only today's file
                    if (cacheopt == 3) and os.path.exists(eventfilepath) and (self.SGML_GIORNOMP != self.TODAYMP):
                        break
                    if (cacheopt != 1) and (cacheopt != 2) and (cacheopt != 3):
                        self.log("Warning: unknown cache option " + str(cacheopt))
                        break

                    # video=2 is not 1, so this goes to the log file only
                    self.log(" Writing in cache \'" + eventfilename + "\'",2)
                    self.log2video(" extracting \"%s\" (%s)" % (channel_name, day))

                    self.SGML_FD = codecs.open(eventfilepath,"w",'utf-8')

                    # header line 1: channel data; line 2: column remark
                    self.SGML_FD.write(self.SGML_CHID + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + channel_provider + self.FIELD_SEPARATOR + day + '\n')
                    self.SGML_FD.write("Local Time (human readeable)###Unix GMT Time###Event Title###Event Description\n")

                    break

    def end_canale(self):
        """</canale>: close the cache file (if one was opened)."""
        if self.SGML_FD != None:
            self.SGML_FD.close()
            self.SGML_FD = None
        self.SGML_CHID = None

    def start_prg(self,attr):
        """<prg orainizio="HH:MM">: remember the event start hour."""
        if self.SGML_FD != None :
            self.SGML_EVENT_STARTHOUR = None
            for name,value in attr:
                if name == "orainizio":
                    self.SGML_EVENT_STARTHOUR = str(value).strip(' \n\r')
                    break

    def end_prg(self):
        """</prg>: assemble the event line and append it to the cache file."""
        if self.SGML_FD != None :

            # events between 00:00 and 05:59 belong to the next calendar
            # day in this feed: shift them forward by 24h (86400 s)
            if (self.SGML_EVENT_STARTHOUR >='00:00') and (self.SGML_EVENT_STARTHOUR <= '05:59') :
                nextdayevent = 86400
            else:
                nextdayevent = 0

            event_starttime = self.SGML_GIORNOMP + '_' + self.SGML_EVENT_STARTHOUR
            # local "YYYY/MM/DD_HH:MM" -> unix GMT, corrected by the
            # configured website-timezone delta
            event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime,"%Y/%m/%d_%H:%M"))) - self.DELTA_UTC + nextdayevent)

            # flatten title to a single line
            event_title = unicode(self.SGML_EVENT_TITLE)
            event_title = event_title.replace('\r','')
            event_title = event_title.replace('\n','')
            event_title = event_title.strip(u' ')

            # optionally fetch the long description from the detail page
            event_description = ''
            if self.CONF_DL_DESC == 1 :
                event_description = unicode(self.get_description(self.SGML_EVENT_SUMMARIE_LINK.strip(' \n\r'), self.CONF_DLDESCMAXCHAR) )
                event_description = event_description.replace('\r','')
                event_description = event_description.replace('\n',u' ')
                event_description = event_description.strip(u' ')

            self.SGML_FD.write(event_starttime + self.FIELD_SEPARATOR + event_startime_unix_gmt + self.FIELD_SEPARATOR + event_title + self.FIELD_SEPARATOR + event_description + '\n')
            self.SGML_TOTAL_EVENTS += 1

    def start_titolo(self,attr):
        """<titolo>: the next character data is the event title."""
        if self.SGML_FD != None:
            self.SGML_TITOLO_INSIDE = True

    def end_titolo(self):
        if self.SGML_FD != None:
            self.SGML_TITOLO_INSIDE = False

    def start_linkscheda(self,attr):
        """<linkscheda>: the next character data is the detail-page URL."""
        if self.SGML_FD != None:
            self.SGML_LINKSCHEDA_INSIDE = True

    def end_linkscheda(self):
        if self.SGML_FD != None:
            self.SGML_LINKSCHEDA_INSIDE = False

    def handle_data(self, data):
        """Character-data callback: capture title / detail link text."""
        if self.SGML_TITOLO_INSIDE == True:
            self.SGML_EVENT_TITLE = data.encode('utf-8')
            self.SGML_EVENT_TITLE = self.SGML_EVENT_TITLE.strip(' \n\r')

        if self.SGML_LINKSCHEDA_INSIDE == True:
            self.SGML_EVENT_SUMMARIE_LINK = data.encode('utf-8')
            self.SGML_EVENT_SUMMARIE_LINK = self.SGML_EVENT_SUMMARIE_LINK.strip(' \n\r')

# -----------------------------------------------

    def log(self,s,video=0):
        """Write to the log file; video==1 also echoes to the OSD."""
        self.logging.log(self.CONF_LOG_PREFIX + str(s))
        if video == 1:
            self.log2video(str(s))

    def log2video(self,s):
        """Write a status line to the on-screen display."""
        self.logging.log2video_status(str(s))

    def convert_daymp(self,dmp):
        """Convert a feed date "YYYY/MM/DD" to compact "YYYYMMDD"."""
        daystandard = time.strftime("%Y%m%d",time.strptime(dmp,"%Y/%m/%d"))
        return daystandard

    def get_description(self,url,maxchar=128):
        """Download and parse an event detail page; return up to *maxchar*
        characters of description ('' on any error or non-http(.htm[l]) url).
        Results are memoized in DESCRIPTIONS_WEBCACHE keyed by hash(url)."""

        if url[:7] != 'http://':
            return('')

        if (url[-5:] != '.html') and (url[-4:] != '.htm') :
            return('')

        # serve repeated URLs from the in-memory cache
        url_hash = hash(url)
        if self.DESCRIPTIONS_WEBCACHE.has_key(url_hash):
            self.log(" cached description " + url)
            return(self.DESCRIPTIONS_WEBCACHE[url_hash])

        self.log(" downloading description " + url )
        url_enc = str(urllib.quote(url,safe=":/"))
        try:
            sock = urllib2.urlopen(url_enc)
            data = sock.read()
        except IOError, e:
            # build a readable reason from URLError/HTTPError attributes
            serr = "unknown"
            if hasattr(e, 'reason'):
                serr = str(e.reason)
            elif hasattr(e, 'code'):
                serr = str(e.code)
                if hasattr(e, 'msg'):
                    serr += " , " + str(e.msg)

            self.log(" error, reason: " + serr + ". Skip it.")
            return('')

        else:
            sock.close()
            dsparser = Description_parser()
            dsparser.parse(data)
            self.DESCRIPTIONS_WEBCACHE[url_hash] = dsparser.get_descr()[:maxchar]
            return(self.DESCRIPTIONS_WEBCACHE[url_hash])

        # unreachable (both branches above return); kept from original
        return('')

    def __init__(self, confdir, dbroot):
        """Read mediaprem.conf from *confdir*, prepare the cache directory
        under *dbroot* and compute the day window / timezone delta."""

        # initialize SGMLLIB
        sgmllib.SGMLParser.__init__(self, 0)

        # initialize logging
        self.logging = scriptlib.logging_class()
        # write to video OSD the script name
        self.logging.log2video_scriptname(self.CONF_LOG_SCRIPT_NAME)

        CONF_FILE = os.path.join(confdir,self.CONF_CONFIGFILENAME)
        if not os.path.exists(CONF_FILE) :
            self.log("ERROR: %s not present" % CONF_FILE,1)
            sys.exit(1)

        config = ConfigParser.ConfigParser()
        #config.optionxform = str # needed to return case sensitive index
        config.read(CONF_FILE)

        # reading [global] section options
        self.CONF_DEFAULT_PROVIDER = config.get("global","DEFAULT_PROVIDER")
        # save cache under dbroot
        self.CONF_CACHEDIR = os.path.join(dbroot,config.get("global","CACHE_DIRNAME"))

        self.CONF_DL_DESC = config.getint("global","DL_DESC")
        self.CONF_MAX_DAY_EPG = config.getint("global","MAX_DAY_EPG")
        self.CONF_URL = config.get("global","URL")

        # GMT_ZONE 'equal' means the website publishes times in our local
        # zone (delta 0); otherwise it is an hour offset, DST-corrected
        self.CONF_GMT_ZONE = config.get("global","GMT_ZONE")
        if self.CONF_GMT_ZONE.strip(' ').lower() == 'equal':
            #self.DELTA_UTC = -scriptlib.delta_utc() # return negative if timezone is east of GMT (like Italy), invert sign
            self.DELTA_UTC = 0
        else:
            self.DELTA_UTC = float(self.CONF_GMT_ZONE)*3600.0
            if self.DELTA_UTC >= 0:
                self.DELTA_UTC = self.DELTA_UTC + scriptlib.delta_dst()
            else:
                self.DELTA_UTC = self.DELTA_UTC - scriptlib.delta_dst()

        self.DELTA_UTC = int(self.DELTA_UTC)
        #self.log("Website timezone - UTC = %d seconds" % self.DELTA_UTC)

        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("Creating \'%s\' directory for caching" % self.CONF_CACHEDIR)
            os.mkdir(self.CONF_CACHEDIR)

        # reading [channels] section
        temp = config.items("channels");

        # create a dictionary (Python array) with index = channel ID
        for i in temp:
            self.CHANNELLIST[i[0].strip(' \n\r').lower()] = unicode(i[1].strip(' \n\r').lower(),'utf-8')

        if len(self.CHANNELLIST) == 0 :
            self.log("ERROR: [channels] section empty ?",1)
            sys.exit(1)

        # set network socket timeout
        socket.setdefaulttimeout(self.CONF_SOCKET_TIMEOUT)

        self.TODAYMP = time.strftime("%Y/%m/%d")
        # create a list filled with dates (format AAAA/MM/DD) from today to today+ MAX_DAY_EPG
        self.DAYCACHEMP=[]
        for day in range(0,self.CONF_MAX_DAY_EPG):
            self.DAYCACHEMP.append(time.strftime("%Y/%m/%d",time.localtime(time.time()+86400*day)))

# ----------------------------------------------------------------------

    def download_and_cache(self):
        """Download the XML feed (with retries) and parse it, writing one
        cache file per (channel, day) via the SGML callbacks above."""
        self.log("--- START DOWNLOAD AND CACHE DATA ---")
        self.log2video("STARTING DOWNLOAD")

        self.log("Removing old cached files")
        scriptlib.cleanup_oldcachedfiles(self.CONF_CACHEDIR, self.FIELD_SEPARATOR)

        self.log("Start download XML data from \'" + self.CONF_URL+"\'")
        self.log2video("downloading XML data ...")

        # retry loop: i counts down; i == -99 flags success
        i = self.HTTP_ERROR_RETRY
        while i > 0:
            try:
                sock = urllib2.urlopen(self.CONF_URL)
                data = sock.read()
            except IOError, e:
                serr = "unknown"
                if hasattr(e, 'reason'):
                    serr = str(e.reason)
                elif hasattr(e, 'code'):
                    serr = str(e.code)
                    if hasattr(e, 'msg'):
                        serr += " , " + str(e.msg)

                self.log("\'" + self.CONF_URL + "\' connection error. Reason: "+serr+". Waiting "+str(self.HTTP_ERROR_WAIT_RETRY)+" sec. and retry ["+str(i)+"] ...")
                time.sleep(self.HTTP_ERROR_WAIT_RETRY) # add sleep
                i -= 1

            else:
                i = -99
                sock.close()

        if (i != -99):
            # all retries failed
            self.log("Cannot retrieve data from \'" + self.CONF_URL + "\'. Abort script")
            self.log2video("Error: cannot download XML data, abort")
            time.sleep(5)
            sys.exit(1)

        self.log("end download XML data, now processing")
        self.log2video("processing XML data, wait ...")

        # start SGMLLIB parsing
        self.parse(data)

        self.log("end process XML data",1)

# ----------------------------------------------------------------------

    def process_cache(self):
        """Read the cache files (sorted, so files of the same channel id are
        adjacent), group events per channel and inject them into the
        CrossEPG database via scriptlib/lamedb."""
        self.log("--- START PROCESSING CACHE ---")
        self.log2video("START PROCESSING CACHE")
        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("ERROR: %s not present" % self.CONF_CACHEDIR,1)
            sys.exit(1)

        self.log("Loading lamedb")
        lamedb = scriptlib.lamedb_class()

        self.log("Initialize CrossEPG database")
        crossdb = scriptlib.crossepg_db_class()
        crossdb.open_db()

        events = []
        previous_id = ''
        channels_name = ''
        total_events = 0

        self.log("Start data processing")
        filelist = sorted(os.listdir(self.CONF_CACHEDIR))
        # sentinel entry forces a final flush of the last channel's events
        filelist.append('***END***')

        for f in filelist :
            # filename layout: id###name###day
            id = f.split(self.FIELD_SEPARATOR)[0]
            if previous_id == '':
                previous_id = id

            if id != previous_id :
                # channel id changed: flush the accumulated events
                total_events += len(events)
                self.log(" ...processing \'%s\' , nr. events %d" % (previous_id,len(events)))
                self.log2video("processed %d events ..." % total_events )

                for c in channels_name:
                    # a channel can have zero or more SID (different channel with same name)
                    # return the list [0e1f:00820000:0708:00c8:1:0 , 1d20:00820000:2fa8:013e:1:0 , ..... ]
                    # return [] if channel name is not in lamedb
                    sidbyname = lamedb.get_sid_byname(c.strip(' \n').lower())

                    # process every SID
                    for s in sidbyname:
                        # convert "0e1f:00820000:0708:00c8:1:0" to sid,tsid,onid
                        # return the list [sid,tsid,onid]
                        ch_sid = lamedb.convert_sid(s)
                        if len(ch_sid) == 0:
                            continue

                        # add channel into db
                        # doesn't matter if the channel already exist... epgdb do all the work
                        crossdb.add_channel(ch_sid)

                        i = 0
                        L = len(events) - 1

                        # process events; an event's duration is the gap to
                        # the next event's start time
                        for e in events:

                            items = e.split(self.FIELD_SEPARATOR)
                            e_starttime = int(items[1])

                            if i < L :
                                e_length = int(events[i+1].split(self.FIELD_SEPARATOR)[1]) - e_starttime
                            else:
                                # last event, dummy length 90 min.
                                e_length = 5400
                            i += 1

                            # extract title and encode Python Unicode with UTF-8
                            e_title = items[2].encode('utf-8')

                            # extract summarie and encode Python Unicode with UTF-8
                            e_summarie = items[3].encode('utf-8')

                            # add_event(start_time , duration , title , summarie , ISO639_language_code , strings_encoded_with_UTF-8)
                            crossdb.add_event(e_starttime, e_length, e_title, e_summarie, 'ita', True )

                if f == '***END***':
                    break

                # start accumulating the next channel
                events = []
                previous_id = id
                channels_name = ''

            if id == previous_id:
                self.log("Reading \'%s\'" % f)
                # read events from cache file using UTF-8 and insert them in events list
                fd = codecs.open(os.path.join(self.CONF_CACHEDIR, f),"r","utf-8")
                lines = fd.readlines()
                fd.close()
                if channels_name == '':
                    # first line has channel data (id,name,provider,date)
                    channels_name = lines[0].split(self.FIELD_SEPARATOR)[1].split('|')
                # the second line is only a remark
                # add events starting from third line
                events.extend(lines[2:])

        # end process, close CrossEPG DB saving data
        crossdb.close_db()
        self.log("TOTAL EPG EVENTS PROCESSED: %d" % total_events)
        self.log("--- END ---")
        self.log2video("END , events processed: %d" % total_events)
558
559
560
# ****************************************************************************************************************************

# MAIN CODE: SCRIPT START HERE

# increase this process niceness (other processes have higher priority)
os.nice(10)

# set Garbage Collector to do a "generational jump" more frequently than default 700
# memory saving: about 50% (!!), some performance loss (obviously)
gc.set_threshold(50,10,10)

# script home, relative to the CrossEPG install root
SCRIPT_DIR = 'scripts/mediaprem/'

# get CrossEPG installation dir. (False on failure)
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
scriptlocation = os.path.join(crossepg_instroot , SCRIPT_DIR)

# get where CrossEPG save data (dbroot) and use it as script cache repository
crossepg_dbroot = crossepg.epgdb_get_dbroot()
if crossepg_dbroot == False:
    sys.exit(1)

# initialize script class (reads mediaprem.conf, prepares cache dir)
script_class = main(scriptlocation , crossepg_dbroot)

# download data and cache them
script_class.download_and_cache()

# read cached data and inject into CrossEPG database
script_class.process_cache()
593
Note: See TracBrowser for help on using the repository browser.