source: ipk/source/epg_crossepg/var/crossepg/scripts/mediaprem/example-mediaprem-minidom.py@ 19259

Last change on this file (as of revision 19259) was revision 7451, checked in by BPanther, 15 years ago

[ipk] - copy source->source.sh4

File size: 16.5 KB
Line 
1#!/usr/bin/python
2# mediaprem.py by Ambrosa http://www.ambrosa.net
3# this module is used for download EPG data from Mediaset website
4# derived from E2_LOADEPG
5
6__author__ = "ambrosa http://www.ambrosa.net"
7__copyright__ = "Copyright (C) 2008-2011 Alessandro Ambrosini"
8__license__ = "CreativeCommons by-nc-sa http://creativecommons.org/licenses/by-nc-sa/3.0/"
9
10import gc
11import os
12import sys
13import time
14import codecs
15import socket
16import urllib
17import urllib2
18import ConfigParser
19from xml.dom import minidom
20
21# import CrossEPG functions
22import crossepg
23
24# location of local python modules under "scripts/lib" dir.
25# add it to sys.path()
# Locate the CrossEPG installation root; abort the script when it is
# unavailable, then make the bundled "scripts/lib" modules importable.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
sys.path.append(os.path.join(crossepg_instroot, 'scripts/lib'))
31
32# import local modules
33import sgmllib
34import scriptlib
35
36# =================================================================
37# HTML PARSER
38
39
class Description_parser(sgmllib.SGMLParser):
    """Scrape the programme description out of a Mediaset web page.

    The description text sits inside ``<div class="box_Text">`` /
    ``<div class="txtBox_cms">``; every text node seen while the inner
    div is open is appended to ``self.description``.
    """

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        # flags tracking which of the two interesting <div>s are open
        self.start_div_box = False
        self.start_div_boxtxt = False
        self.description = ''

    def parse(self, s):
        # Feed the whole document at once and finalize parsing.
        self.feed(s)
        self.close()

    def start_div(self, attributes):
        # Raise the matching flag when one of the marker <div>s opens.
        for name, value in attributes:
            if name == "class":
                if value == "box_Text":
                    self.start_div_box = True
                elif value == "txtBox_cms":
                    self.start_div_boxtxt = True

    def end_div(self):
        # Any </div> seen while inside the text container ends collection.
        if self.start_div_boxtxt:
            self.start_div_box = False
            self.start_div_boxtxt = False

    def handle_data(self, data):
        # Accumulate text only while inside the description container.
        # Remote pages are ISO-8859-1 encoded.
        if self.start_div_boxtxt:
            self.description += data.decode('iso-8859-1')

    def get_descr(self):
        """Return the collected description, stripped of surrounding whitespace."""
        return self.description.strip(' \n\r')
72
73# =================================================================
74
75
class main:
    """Download Mediaset Premium EPG data, cache it per channel/day, and
    inject the cached events into the CrossEPG database.

    Workflow: ``__init__`` (read mediaprem.conf, sanity checks) ->
    ``download_and_cache`` (fetch XML, write one cache file per
    channel+day) -> ``process_cache`` (parse cache files into CrossEPG).
    """

    # main config file
    CONF_CONFIGFILENAME = "mediaprem.conf"

    # Network socket timeout (in seconds)
    CONF_SOCKET_TIMEOUT = 20

    # log text
    CONF_LOG_SCRIPT_NAME = "MediasetPremium (Italy)"
    CONF_LOG_PREFIX = ""

    # max chars in description
    CONF_DLDESCMAXCHAR = 250

    # retry number if HTTP error
    HTTP_ERROR_RETRY = 3
    # seconds to wait between retries
    HTTP_ERROR_WAIT_RETRY = 5

    # charset used in remote website epg data
    REMOTE_EPG_CHARSET = 'utf-8'

    # today's date ("YYYY/MM/DD"), set in __init__
    TODAYMP = ''
    # list of dates ("YYYY/MM/DD") to fetch, today..today+MAX_DAY_EPG
    DAYCACHEMP = []
    # separator used both in cache file names and inside cache lines
    FIELD_SEPARATOR = '###'
    # channel id -> "cacheopt[,name[,provider]]" (from [channels] section)
    CHANNELLIST = {}


    def log(self,s,video=0):
        """Write *s* to the log file; when video == 1 also echo to the OSD."""
        self.logging.log(self.CONF_LOG_PREFIX + str(s))
        if video == 1:
            self.log2video(str(s))

    def log2video(self,s):
        """Show *s* on the on-screen display status line."""
        self.logging.log2video_status(str(s))

    def convert_daymp(self,dmp):
        """Convert a Mediaset date "YYYY/MM/DD" to compact "YYYYMMDD"."""
        daystandard = time.strftime("%Y%m%d",time.strptime(dmp,"%Y/%m/%d"))
        return daystandard


    def get_description(self,url):
        """Download *url* and return the extracted programme description.

        Returns '' for non-http URLs, URLs that are not .html/.htm pages,
        or on any download error (best effort: errors are logged, not raised).
        """

        if url[:7] != 'http://':
            return('')

        if (url[-5:] != '.html') and (url[-4:] != '.htm') :
            return('')

        self.log(" downloading description \'" + url + "\'")
        # percent-encode everything except the scheme/path separators
        url = str(urllib.quote(url,safe=":/"))

        try:
            sock = urllib2.urlopen(url)
            data = sock.read()
        except IOError, e:
            # build a human readable reason from whatever the exception carries
            serr = "unknown"
            if hasattr(e, 'reason'):
                serr = str(e.reason)
            elif hasattr(e, 'code'):
                serr = str(e.code)
                if hasattr(e, 'msg'):
                    serr += " , " + str(e.msg)

            self.log(url + " error, reason: " + serr + ". Skip it.")
            return('')

        else:
            sock.close()
            dsparser = Description_parser()
            dsparser.parse(data)
            return(dsparser.get_descr())

        return('')



    def __init__(self,confdir,dbroot):
        """Read mediaprem.conf from *confdir*; cache files go under *dbroot*.

        Exits the process (sys.exit(1)) on any fatal condition: no swap,
        missing config file, or empty [channels] section.
        """

        # initialize logging
        self.logging = scriptlib.logging_class()
        # write to video OSD the script name
        self.logging.log2video_scriptname(self.CONF_LOG_SCRIPT_NAME)


        # check swap memory available (XML parsing is memory hungry)
        osp = os.popen('free | awk \'/Swap/ { print $2 }\'','r')
        ret = osp.readlines()
        if len(ret) > 0:
            try:
                # "free" reports KB; convert to MB
                m = int(ret[0])/1024
            except:
                self.log("Error get SWAP value, abort",1)
                time.sleep(10)
                sys.exit(1)

            if m < 60:
                self.log("SWAP Not Enabled (<60MB), abort",1)
                time.sleep(10)
                sys.exit(1)
        else:
            self.log("Error get SWAP value, abort",1)
            time.sleep(10)
            sys.exit(1)

        osp.close()


        CONF_FILE = os.path.join(confdir,self.CONF_CONFIGFILENAME)
        if not os.path.exists(CONF_FILE) :
            self.log("ERROR: %s not present" % CONF_FILE,1)
            sys.exit(1)

        config = ConfigParser.ConfigParser()
        #config.optionxform = str # needed to return case sensitive index
        config.read(CONF_FILE)

        # reading [global] section options
        self.CONF_DEFAULT_PROVIDER = config.get("global","DEFAULT_PROVIDER")
        # save cache under dbroot
        self.CONF_CACHEDIR = os.path.join(dbroot,config.get("global","CACHE_DIRNAME"))

        self.CONF_DL_DESC = config.getint("global","DL_DESC")
        self.CONF_MAX_DAY_EPG = config.getint("global","MAX_DAY_EPG")
        self.CONF_URL = config.get("global","URL")

        # GMT_ZONE is either 'equal' (site times already local) or an
        # hour offset; DST correction is applied away from UTC.
        self.CONF_GMT_ZONE = config.get("global","GMT_ZONE")
        if self.CONF_GMT_ZONE.strip(' ').lower() == 'equal':
            #self.DELTA_UTC = -scriptlib.delta_utc() # return negative if timezone is east of GMT (like Italy), invert sign
            self.DELTA_UTC = 0
        else:
            self.DELTA_UTC = float(self.CONF_GMT_ZONE)*3600.0
            if self.DELTA_UTC >= 0:
                self.DELTA_UTC = self.DELTA_UTC + scriptlib.delta_dst()
            else:
                self.DELTA_UTC = self.DELTA_UTC - scriptlib.delta_dst()

        self.DELTA_UTC = int(self.DELTA_UTC)
        #self.log("Website timezone - UTC = %d seconds" % self.DELTA_UTC)

        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("Creating \'%s\' directory for caching" % self.CONF_CACHEDIR)
            os.mkdir(self.CONF_CACHEDIR)

        # reading [channels] section
        temp = config.items("channels");

        # create a dictionary (Python array) with index = channel ID
        for i in temp:
            self.CHANNELLIST[i[0].strip(' \n\r').lower()] = unicode(i[1].strip(' \n\r').lower(),'utf-8')

        if len(self.CHANNELLIST) == 0 :
            self.log("ERROR: [channels] section empty ?",1)
            sys.exit(1)

        # set network socket timeout
        socket.setdefaulttimeout(self.CONF_SOCKET_TIMEOUT)

        self.TODAYMP = time.strftime("%Y/%m/%d")
        # create a list filled with dates (format AAAA/MM/DD) from today to today+ MAX_DAY_EPG
        self.DAYCACHEMP=[self.TODAYMP]
        for day in range(1,self.CONF_MAX_DAY_EPG):
            self.DAYCACHEMP.append(time.strftime("%Y/%m/%d",time.localtime(time.time()+86400*day)))



# ----------------------------------------------------------------------


    def download_and_cache(self):
        """Fetch the EPG XML feed and write one cache file per channel/day.

        Cache file name: "<id>###<name>###<YYYYMMDD>" (escaped); first
        line carries channel metadata, second is a header remark, then
        one "start###unixgmt###title###description" line per event.
        Exits the process when the feed cannot be downloaded or parsed.
        """
        self.log("--- START DOWNLOAD AND CACHE DATA ---")
        self.log2video("STARTING DOWNLOAD")

        self.log("Removing old cached files")
        scriptlib.cleanup_oldcachedfiles(self.CONF_CACHEDIR, self.FIELD_SEPARATOR)

        chlist = self.CHANNELLIST

        self.log("Start download XML data from \'" + self.CONF_URL+"\'")
        self.log2video("downloading XML data ...")

        # retry loop: i counts down on failure, -99 flags success
        i = self.HTTP_ERROR_RETRY
        while i > 0:
            try:
                sock = urllib2.urlopen(self.CONF_URL)
                data = sock.read()
            except IOError, e:
                serr = "unknown"
                if hasattr(e, 'reason'):
                    serr = str(e.reason)
                elif hasattr(e, 'code'):
                    serr = str(e.code)
                    if hasattr(e, 'msg'):
                        serr += " , " + str(e.msg)

                self.log("\'" + self.CONF_URL + "\' connection error. Reason: "+serr+". Waiting "+str(self.HTTP_ERROR_WAIT_RETRY)+" sec. and retry ["+str(i)+"] ...")
                time.sleep(self.HTTP_ERROR_WAIT_RETRY) # add sleep
                i -= 1

            else:
                i = -99
                sock.close()

        if (i != -99):
            self.log("Cannot retrieve data from \'" + self.CONF_URL + "\'. Abort script")
            self.log2video("Error: cannot download XML data, abort")
            time.sleep(5)
            sys.exit(1)

        self.log("End download XML data, now processing XML code.")
        self.log2video("preprocessing XML data, wait ...")
        try:
            xmldoc = minidom.parseString(data)
        except:
            self.log("Warning ! Data are not in a valid XML format. Abort script")
            self.log2video("Error: no valid XML data, abort")
            time.sleep(5)
            sys.exit(1)


        self.log("End process XML data")
        self.log2video("end process XML data")

        # days list: <giorno data="YYYY/MM/DD"> elements
        xmlref_giorno = xmldoc.getElementsByTagName('giorno')
        for xml_gg in xmlref_giorno:
            gg = xml_gg.attributes["data"].value
            if gg not in self.DAYCACHEMP :
                continue

            # channels for this day: <canale id="...">
            xmlref_canale = xml_gg.getElementsByTagName('canale')
            for xml_ch in xmlref_canale:
                chid = xml_ch.attributes["id"].value.strip(' \n\r').lower()
                if not chlist.has_key(chid) :
                    self.log("Warning: new channel \"id=%s name=%s\" found in XML data" % (xml_ch.attributes["id"].value,xml_ch.attributes["description"]))
                    continue

                # a "<id>+1" entry in the config means a timeshift twin channel
                clist = [chid]
                if self.CHANNELLIST.has_key(chid + '+1'):
                    clist.append(chid + '+1')

                for c in clist:

                    # get cache option
                    # 0 : don't download/cache
                    # 1 : download and cache (optional 1,new_name )
                    # 2 : always download overwriting existing files (optional 2,new_name )
                    # 3 : always download overwriting existing files only for TODAY (optional 3,new_name )

                    cacheopt = int(chlist[c].split(",")[0])

                    # if cacheopt == 0, do nothing
                    if cacheopt == 0:
                        continue

                    channel_name = ''
                    if len(chlist[c].split(",")) > 1 :
                        if chlist[c].split(",")[1] != '' :
                            # channel renamed, new name provided by user
                            channel_name = chlist[c].split(",")[1].strip(' \n\r').lower()

                    # if channel name is not present as option, quit with error
                    if channel_name == '':
                        self.log("ERROR ! ID=%s channel name not present" % c)
                        sys.exit(1)

                    channel_provider = self.CONF_DEFAULT_PROVIDER
                    if len(chlist[c].split(",")) > 2 :
                        if chlist[c].split(",")[2] != '' :
                            channel_provider = chlist[c].split(",")[2].strip(' \n\r').lower()

                    # if channel name is not present as option in channel_list.conf , quit with error
                    # NOTE(review): unreachable — the identical check above already
                    # called sys.exit(1) when channel_name is empty
                    if channel_name == '':
                        self.log("ERROR ! ID=" + str(c) + " channel name not present. Skip !")
                        continue

                    # download only if file doesn't exist or cacheopt == 2 (always download),
                    # using open(...,"w") files will be overwritten (saving a delete + create)

                    day = str(self.convert_daymp(gg))
                    eventfilename = scriptlib.fn_escape(str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + day)
                    eventfilepath = os.path.join(self.CONF_CACHEDIR, eventfilename)
                    if (cacheopt == 1) and os.path.exists(eventfilepath):
                        continue
                    if (cacheopt == 3) and os.path.exists(eventfilepath) and (gg != self.TODAYMP):
                        continue
                    if (cacheopt != 1) and (cacheopt != 2) and (cacheopt != 3):
                        self.log("Warning: unknown cache option " + str(cacheopt))
                        # NOTE(review): exit_for_loop is assigned but never read — dead code
                        exit_for_loop = True
                        continue

                    num_events = 0
                    # NOTE(review): video=2 means "log file only" — log() echoes to OSD only when video == 1
                    self.log(" Writing in cache \'" + eventfilename + "\'",2)
                    self.log2video(" extracting \"%s\" [%d] (%s)" % (channel_name, num_events, day))

                    fd=codecs.open(eventfilepath,"w",'utf-8')

                    fd.write(str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + channel_provider + self.FIELD_SEPARATOR + day + '\n')
                    fd.write("Local Time (human readeable)###Unix GMT Time###Event Title###Event Description\n")

                    # events for this channel: <prg orainizio="HH:MM">
                    xmlref_events = xml_ch.getElementsByTagName('prg')
                    for xml_ee in xmlref_events:
                        orainiz = xml_ee.attributes["orainizio"].value

                        # events between 00:00 and 05:59 belong to the next calendar day
                        if (orainiz >='00:00') and (orainiz <= '05:59') :
                            nextdayevent = 86400
                        else:
                            nextdayevent = 0

                        event_starttime = gg + " " + orainiz

                        if c == (chid + '+1'):
                            # manage channel "+1": shift start time by one hour
                            event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime,"%Y/%m/%d %H:%M"))) - self.DELTA_UTC + 3600 + nextdayevent)
                        else:
                            # normal channel, not "+1"
                            event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime,"%Y/%m/%d %H:%M"))) - self.DELTA_UTC + nextdayevent)


                        # title must be a single line: strip CR/LF and spaces
                        event_title = unicode(xml_ee.getElementsByTagName('titolo')[0].firstChild.data)
                        event_title = event_title.replace('\r','')
                        event_title = event_title.replace('\n','')
                        event_title = event_title.strip(u' ')

                        event_description = ''
                        if self.CONF_DL_DESC == 1 :
                            # optionally fetch the long description from the linked page
                            url_desc = xml_ee.getElementsByTagName('linkScheda')[0].firstChild.data
                            event_description = unicode(self.get_description(url_desc.strip(' \n\r'))[:self.CONF_DLDESCMAXCHAR])
                            event_description = event_description.replace('\r','')
                            event_description = event_description.replace('\n',u' ')
                            event_description = event_description.strip(u' ')

                        fd.write(event_starttime + self.FIELD_SEPARATOR + event_startime_unix_gmt + self.FIELD_SEPARATOR + event_title + self.FIELD_SEPARATOR + event_description + '\n')
                        num_events += 1
                        self.log2video(" extracting \"%s\" [%d] (%s)" % (channel_name, num_events, day))


                    fd.close()

        # free the DOM explicitly: it can be large on low-memory receivers
        del xmldoc

# ----------------------------------------------------------------------


    def process_cache(self):
        """Read the cache files and inject their events into CrossEPG.

        Files are processed sorted by name, so all files of one channel id
        are consecutive; events are flushed to the DB whenever the id
        changes. The sentinel '***END***' forces a final flush.
        """
        self.log("--- START PROCESSING CACHE ---")
        self.log2video("START PROCESSING CACHE")
        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("ERROR: %s not present" % self.CONF_CACHEDIR,1)
            sys.exit(1)

        self.log("Loading lamedb")
        lamedb = scriptlib.lamedb_class()

        self.log("Initialize CrossEPG database")
        crossdb = scriptlib.crossepg_db_class()
        crossdb.open_db()

        events = []
        previous_id = ''
        channels_name = ''
        total_events = 0

        self.log("Start data processing")
        filelist = sorted(os.listdir(self.CONF_CACHEDIR))
        # sentinel entry guarantees the last channel group gets flushed
        filelist.append('***END***')

        for f in filelist :
            id = f.split(self.FIELD_SEPARATOR)[0]
            if previous_id == '':
                previous_id = id

            # channel id changed: flush the accumulated events of the previous id
            if id != previous_id :
                total_events += len(events)
                self.log(" ...processing \'%s\' , nr. events %d" % (previous_id,len(events)))
                self.log2video("processed %d events ..." % total_events )

                for c in channels_name:
                    # a channel can have zero or more SID (different channel with same name)
                    # return the list [0e1f:00820000:0708:00c8:1:0 , 1d20:00820000:2fa8:013e:1:0 , ..... ]
                    # return [] if channel name is not in lamedb
                    sidbyname = lamedb.get_sid_byname(c.strip(' \n').lower())

                    # process every SID
                    for s in sidbyname:
                        # convert "0e1f:00820000:0708:00c8:1:0" to sid,tsid,onid
                        # return the list [sid,tsid,onid]
                        ch_sid = lamedb.convert_sid(s)
                        if len(ch_sid) == 0:
                            continue

                        # add channel into db
                        # doesn't matter if the channel already exist... epgdb do all the work
                        crossdb.add_channel(ch_sid)

                        i = 0
                        L = len(events) - 1

                        # process events
                        for e in events:

                            items = e.split(self.FIELD_SEPARATOR)
                            e_starttime = int(items[1])

                            # event duration = gap to the next event's start
                            if i < L :
                                e_length = int(events[i+1].split(self.FIELD_SEPARATOR)[1]) - e_starttime
                            else:
                                # last event, dummy length 90 min.
                                e_length = 5400
                            i += 1

                            # extract title and encode Python Unicode with UTF-8
                            e_title = items[2].encode('utf-8')

                            # extract summarie and encode Python Unicode with UTF-8
                            e_summarie = items[3].encode('utf-8')

                            # add_event(start_time , duration , title , summarie , ISO639_language_code , strings_encoded_with_UTF-8)
                            crossdb.add_event(e_starttime, e_length, e_title, e_summarie, 'ita', True )

                if f == '***END***':
                    break

                events = []
                previous_id = id
                channels_name = ''

            if id == previous_id:
                self.log("Reading \'%s\'" % f)
                # read events from cache file using UTF-8 and insert them in events list
                fd = codecs.open(os.path.join(self.CONF_CACHEDIR, f),"r","utf-8")
                lines = fd.readlines()
                fd.close()
                if channels_name == '':
                    # first line has channel data (id,name,provider,date)
                    # name may be "a|b|..." when several lamedb names map to one id
                    channels_name = lines[0].split(self.FIELD_SEPARATOR)[1].split('|')
                # the second line is only a remark
                # add events starting from third line
                events.extend(lines[2:])

        # end process, close CrossEPG DB saving data
        crossdb.close_db()
        self.log("TOTAL EPG EVENTS PROCESSED: %d" % total_events)
        self.log("--- END ---")
        self.log2video("END , events processed: %d" % total_events)
522
523
524
525# ****************************************************************************************************************************
526
527# MAIN CODE: SCRIPT START HERE
528
529# increase this process niceness (other processes have higher priority)
# ---------------------------------------------------------------------------
# Script entry point
# ---------------------------------------------------------------------------

# Run at lower priority so other receiver processes are not starved.
os.nice(10)

# Make the garbage collector run generational passes more often than the
# default threshold of 700: saves roughly half the memory at some CPU cost.
gc.set_threshold(50,10,10)

SCRIPT_DIR = 'scripts/mediaprem/'

# Resolve the CrossEPG installation directory; abort when unknown.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
scriptlocation = os.path.join(crossepg_instroot , SCRIPT_DIR)

# Resolve the CrossEPG data directory (dbroot), used as the cache root.
crossepg_dbroot = crossepg.epgdb_get_dbroot()
if crossepg_dbroot == False:
    sys.exit(1)

# Build the worker, download/cache the feed, then load it into CrossEPG.
script_class = main(scriptlocation , crossepg_dbroot)
script_class.download_and_cache()
script_class.process_cache()
557
Note: See TracBrowser for help on using the repository browser.