source: ipk/source/epg_crossepg/var/crossepg/scripts/mediaprem/mediaprem.py@ 17993

Last change on this file since 17993 was 7451, checked in by BPanther, 15 years ago

[ipk] - copy source->source.sh4

File size: 17.5 KB
Line 
#!/usr/bin/python
# mediaprem.py by Ambrosa http://www.ambrosa.net
# this module is used for download EPG data from Mediaset website
# derived from E2_LOADEPG
# NOTE(review): Python 2 only (urllib2, ConfigParser, sgmllib, unicode).

__author__ = "ambrosa http://www.ambrosa.net"
__copyright__ = "Copyright (C) 2008-2011 Alessandro Ambrosini"
__license__ = "CreativeCommons by-nc-sa http://creativecommons.org/licenses/by-nc-sa/3.0/"

# standard library imports
import gc
import os
import sys
import time
import codecs
import socket
import urllib
import urllib2
import ConfigParser
#from xml.dom import minidom

# import CrossEPG functions (project-local C-extension/module)
import crossepg

# location of local python modules under "scripts/lib" dir.
# add it to sys.path()
# epgdb_get_installroot() returns False on failure, hence the == False check.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
libdir = os.path.join(crossepg_instroot , 'scripts/lib')
sys.path.append(libdir)

# import local modules (resolved via the sys.path entry added above;
# sgmllib here is the bundled copy under scripts/lib, not the stdlib one)
import sgmllib
import scriptlib
35
36# =================================================================
37# HTML PARSER used for parsing description
38
39
class Description_parser(sgmllib.SGMLParser):
    """SGML parser that extracts the programme description text from an
    event detail HTML page.

    The description lives inside a <div class="txtBox_cms"> (which the
    pages nest inside a <div class="box_Text">); all character data seen
    while that div is open is accumulated into ``self.description``.
    """

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        # True while the corresponding <div> is open
        self.start_div_box = False
        self.start_div_boxtxt = False
        # accumulated description text (unicode)
        self.description = ''

    def parse(self, s):
        """Feed the whole HTML page to the parser and finalize parsing."""
        self.feed(s)
        self.close()

    def start_div(self, attributes):
        # sgmllib callback for every <div ...>: flag the divs of interest
        for attr_name, attr_value in attributes:
            if attr_name != "class":
                continue
            if attr_value == "box_Text":
                self.start_div_box = True
            elif attr_value == "txtBox_cms":
                self.start_div_boxtxt = True

    def end_div(self):
        # sgmllib callback for every </div>: closing the description box
        # clears both flags so later divs are ignored
        if self.start_div_boxtxt:
            self.start_div_box = False
            self.start_div_boxtxt = False

    def handle_data(self, data):
        # collect text only while inside the description <div>;
        # pages are served as iso-8859-1, decode to unicode here
        if self.start_div_boxtxt:
            self.description += data.decode('iso-8859-1')

    def get_descr(self):
        """Return the collected description stripped of blanks/newlines."""
        return self.description.strip(' \n\r')
72
73
74
75# =================================================================
76
77
class main(sgmllib.SGMLParser):
    """Downloader/processor for Mediaset Premium EPG data.

    The class is itself an SGML parser: sgmllib dispatches each XML tag
    <foo> to ``start_foo``/``end_foo`` methods below.  Tag names are
    Italian: palinsesto = schedule, giorno = day, canale = channel,
    prg = programme, titolo = title, linkscheda = detail-page link.

    Workflow: ``download_and_cache()`` fetches the XML feed and writes one
    UTF-8 cache file per (channel, day); ``process_cache()`` re-reads the
    cache files and injects the events into the CrossEPG database.
    """

    # main config file
    CONF_CONFIGFILENAME = "mediaprem.conf"

    # Network socket timeout (in seconds)
    CONF_SOCKET_TIMEOUT = 20

    # log text
    CONF_LOG_SCRIPT_NAME = "MediasetPremium (Italy)"
    CONF_LOG_PREFIX = ""

    # max chars in description
    CONF_DLDESCMAXCHAR = 250

    # retry number if HTTP error
    HTTP_ERROR_RETRY = 3
    # seconds to wait between retries
    HTTP_ERROR_WAIT_RETRY = 5

    # charset used in remote website epg data
    REMOTE_EPG_CHARSET = 'utf-8'

    # today's date as "YYYY/MM/DD" (set in __init__)
    TODAYMP = ''
    # list of dates (YYYY/MM/DD) to cache: today .. today+MAX_DAY_EPG-1
    DAYCACHEMP = []
    # field separator used both in cache file names and cache file lines
    FIELD_SEPARATOR = '###'
    # channel id -> "cacheopt,name[,provider]" read from [channels] section
    CHANNELLIST = {}

    # url hash -> already-downloaded description (in-memory cache)
    DESCRIPTIONS_WEBCACHE = {}

    # -------- xml processing using SGMLLIB -----------
    # best way is use xml.minidom but it's very memory hungry (about 40MB
    # memory for 2 MB XML file); sgmllib can simply parse xml data.
    # The SGML_* attributes below are parser state shared across callbacks.
    SGML_PALINSESTO_INSIDE = False
    SGML_TITOLO_INSIDE = False
    SGML_LINKSCHEDA_INSIDE = False

    SGML_GIORNOMP = None        # current <giorno> date, None when skipping
    SGML_CHID = None            # current <canale> id
    SGML_FD = None              # open cache file, None when channel skipped
    SGML_TOTAL_EVENTS = 0

    SGML_EVENT_STARTHOUR = None
    SGML_EVENT_TITLE = None
    SGML_EVENT_SUMMARIE = None

    def parse(self, s):
        """Feed the whole XML document to the parser and finalize it."""
        self.feed(s)
        self.close()

    def start_palinsesto(self, attr):
        """<palinsesto>: mark that we are inside the schedule root."""
        self.SGML_PALINSESTO_INSIDE = True

    def end_palinsesto(self):
        """</palinsesto>: reset state and log the grand total."""
        self.SGML_PALINSESTO_INSIDE = False
        self.SGML_GIORNOMP = None
        self.log("extracted %d events" % self.SGML_TOTAL_EVENTS)

    def start_giorno(self,attr):
        """<giorno data="YYYY/MM/DD">: accept the day only if it falls in
        the DAYCACHEMP window, otherwise leave SGML_GIORNOMP = None so the
        following <canale>/<prg> tags are ignored."""
        if self.SGML_PALINSESTO_INSIDE == True :
            self.SGML_GIORNOMP = None
            for name,value in attr:
                if name == "data":
                    if str(value).strip(' \n\r') in self.DAYCACHEMP :
                        self.SGML_GIORNOMP = str(value).strip(' \n\r')
                    break

    def end_giorno(self):
        """</giorno>: no current day anymore."""
        self.SGML_GIORNOMP = None

    def start_canale(self,attr):
        """<canale id="...">: decide, from the per-channel cache option,
        whether to (re)create the cache file for (channel, current day).

        On success SGML_FD is left open and the two header lines are
        written; on any skip condition SGML_FD stays None and the events
        of this channel are discarded by the other callbacks.
        """
        if self.SGML_GIORNOMP != None:
            for name,value in attr:
                if name == "id":
                    self.SGML_CHID = str(value).strip(' \n\r').lower()

                    # channel id present in XML but not configured: warn and skip
                    if not self.CHANNELLIST.has_key(self.SGML_CHID) :
                        self.log("Warning: new channel id=%s found in XML data" % self.SGML_CHID )
                        break

                    # get cache option
                    # 0 : don't download/cache
                    # 1 : download and cache (optional 1,new_name )
                    # 2 : always download overwriting existing files (optional 2,new_name )
                    # 3 : always download overwriting existing files only for TODAY (optional 3,new_name )
                    cacheopt = int(self.CHANNELLIST[self.SGML_CHID].split(",")[0])

                    # if cacheopt == 0, do nothing
                    if cacheopt == 0:
                        break

                    channel_name = ''
                    if len(self.CHANNELLIST[self.SGML_CHID].split(",")) > 1 :
                        if self.CHANNELLIST[self.SGML_CHID].split(",")[1] != '' :
                            # channel renamed, new name provided by user
                            channel_name = self.CHANNELLIST[self.SGML_CHID].split(",")[1].strip(' \n\r').lower()

                    # if channel name is not present as option, quit with error
                    if channel_name == '':
                        self.log("ERROR ! ID=%s channel name not present" % self.SGML_CHID)
                        sys.exit(1)

                    # optional third field overrides the default provider
                    channel_provider = self.CONF_DEFAULT_PROVIDER
                    if len(self.CHANNELLIST[self.SGML_CHID].split(",")) > 2 :
                        if self.CHANNELLIST[self.SGML_CHID].split(",")[2] != '' :
                            channel_provider = self.CHANNELLIST[self.SGML_CHID].split(",")[2].strip(' \n\r').lower()

                    # NOTE(review): unreachable — channel_name == '' already
                    # exited above; kept as in the original source.
                    if channel_name == '':
                        self.log("ERROR ! ID=" + self.SGML_CHID + " channel name not present. Skip !")
                        break

                    day = str(self.convert_daymp(self.SGML_GIORNOMP))
                    eventfilename = scriptlib.fn_escape(self.SGML_CHID + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + day)
                    eventfilepath = os.path.join(self.CONF_CACHEDIR, eventfilename)
                    # cacheopt 1: keep an existing cache file as-is
                    if (cacheopt == 1) and os.path.exists(eventfilepath):
                        break
                    # cacheopt 3: overwrite only today's file
                    if (cacheopt == 3) and os.path.exists(eventfilepath) and (self.SGML_GIORNOMP != self.TODAYMP):
                        break
                    if (cacheopt != 1) and (cacheopt != 2) and (cacheopt != 3):
                        self.log("Warning: unknown cache option " + str(cacheopt))
                        break

                    # video=2 is not 1, so this goes to the log file only
                    self.log(" Writing in cache \'" + eventfilename + "\'",2)
                    self.log2video(" extracting \"%s\" (%s)" % (channel_name, day))

                    self.SGML_FD = codecs.open(eventfilepath,"w",'utf-8')

                    # header line 1: channel data; line 2: column remark
                    self.SGML_FD.write(self.SGML_CHID + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + channel_provider + self.FIELD_SEPARATOR + day + '\n')
                    self.SGML_FD.write("Local Time (human readeable)###Unix GMT Time###Event Title###Event Description\n")

                    break

    def end_canale(self):
        """</canale>: close the cache file (if one was opened)."""
        if self.SGML_FD != None:
            self.SGML_FD.close()
            self.SGML_FD = None
        self.SGML_CHID = None

    def start_prg(self,attr):
        """<prg orainizio="HH:MM">: remember the event start hour."""
        if self.SGML_FD != None :
            self.SGML_EVENT_STARTHOUR = None
            for name,value in attr:
                if name == "orainizio":
                    self.SGML_EVENT_STARTHOUR = str(value).strip(' \n\r')
                    break

    def end_prg(self):
        """</prg>: assemble the event line and append it to the cache file."""
        if self.SGML_FD != None :

            # events between 00:00 and 05:59 belong to the next calendar
            # day in this feed: shift them forward by 24h (86400 s)
            if (self.SGML_EVENT_STARTHOUR >='00:00') and (self.SGML_EVENT_STARTHOUR <= '05:59') :
                nextdayevent = 86400
            else:
                nextdayevent = 0

            event_starttime = self.SGML_GIORNOMP + '_' + self.SGML_EVENT_STARTHOUR
            # local "YYYY/MM/DD_HH:MM" -> unix GMT, corrected by the
            # configured website-timezone delta
            event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime,"%Y/%m/%d_%H:%M"))) - self.DELTA_UTC + nextdayevent)

            # flatten title to a single line
            event_title = unicode(self.SGML_EVENT_TITLE)
            event_title = event_title.replace('\r','')
            event_title = event_title.replace('\n','')
            event_title = event_title.strip(u' ')

            # optionally fetch the long description from the detail page
            event_description = ''
            if self.CONF_DL_DESC == 1 :
                event_description = unicode(self.get_description(self.SGML_EVENT_SUMMARIE_LINK.strip(' \n\r'), self.CONF_DLDESCMAXCHAR) )
                event_description = event_description.replace('\r','')
                event_description = event_description.replace('\n',u' ')
                event_description = event_description.strip(u' ')

            self.SGML_FD.write(event_starttime + self.FIELD_SEPARATOR + event_startime_unix_gmt + self.FIELD_SEPARATOR + event_title + self.FIELD_SEPARATOR + event_description + '\n')
            self.SGML_TOTAL_EVENTS += 1

    def start_titolo(self,attr):
        """<titolo>: the next character data is the event title."""
        if self.SGML_FD != None:
            self.SGML_TITOLO_INSIDE = True

    def end_titolo(self):
        if self.SGML_FD != None:
            self.SGML_TITOLO_INSIDE = False

    def start_linkscheda(self,attr):
        """<linkscheda>: the next character data is the detail-page URL."""
        if self.SGML_FD != None:
            self.SGML_LINKSCHEDA_INSIDE = True

    def end_linkscheda(self):
        if self.SGML_FD != None:
            self.SGML_LINKSCHEDA_INSIDE = False

    def handle_data(self, data):
        """Character-data callback: capture title / detail link text."""
        if self.SGML_TITOLO_INSIDE == True:
            self.SGML_EVENT_TITLE = data.encode('utf-8')
            self.SGML_EVENT_TITLE = self.SGML_EVENT_TITLE.strip(' \n\r')

        if self.SGML_LINKSCHEDA_INSIDE == True:
            self.SGML_EVENT_SUMMARIE_LINK = data.encode('utf-8')
            self.SGML_EVENT_SUMMARIE_LINK = self.SGML_EVENT_SUMMARIE_LINK.strip(' \n\r')

# -----------------------------------------------

    def log(self,s,video=0):
        """Write to the log file; video==1 also echoes to the OSD."""
        self.logging.log(self.CONF_LOG_PREFIX + str(s))
        if video == 1:
            self.log2video(str(s))

    def log2video(self,s):
        """Write a status line to the on-screen display."""
        self.logging.log2video_status(str(s))

    def convert_daymp(self,dmp):
        """Convert a feed date "YYYY/MM/DD" to compact "YYYYMMDD"."""
        daystandard = time.strftime("%Y%m%d",time.strptime(dmp,"%Y/%m/%d"))
        return daystandard

    def get_description(self,url,maxchar=128):
        """Download and parse an event detail page; return up to *maxchar*
        characters of description ('' on any error or non-http(.htm[l]) url).
        Results are memoized in DESCRIPTIONS_WEBCACHE keyed by hash(url)."""

        if url[:7] != 'http://':
            return('')

        if (url[-5:] != '.html') and (url[-4:] != '.htm') :
            return('')

        # serve repeated URLs from the in-memory cache
        url_hash = hash(url)
        if self.DESCRIPTIONS_WEBCACHE.has_key(url_hash):
            self.log(" cached description " + url)
            return(self.DESCRIPTIONS_WEBCACHE[url_hash])

        self.log(" downloading description " + url )
        url_enc = str(urllib.quote(url,safe=":/"))
        try:
            sock = urllib2.urlopen(url_enc)
            data = sock.read()
        except IOError, e:
            # build a readable reason from URLError/HTTPError attributes
            serr = "unknown"
            if hasattr(e, 'reason'):
                serr = str(e.reason)
            elif hasattr(e, 'code'):
                serr = str(e.code)
                if hasattr(e, 'msg'):
                    serr += " , " + str(e.msg)

            self.log(" error, reason: " + serr + ". Skip it.")
            return('')

        else:
            sock.close()
            dsparser = Description_parser()
            dsparser.parse(data)
            self.DESCRIPTIONS_WEBCACHE[url_hash] = dsparser.get_descr()[:maxchar]
            return(self.DESCRIPTIONS_WEBCACHE[url_hash])

        # unreachable (both branches above return); kept from original
        return('')

    def __init__(self, confdir, dbroot):
        """Read mediaprem.conf from *confdir*, prepare the cache directory
        under *dbroot* and compute the day window / timezone delta."""

        # initialize SGMLLIB
        sgmllib.SGMLParser.__init__(self, 0)

        # initialize logging
        self.logging = scriptlib.logging_class()
        # write to video OSD the script name
        self.logging.log2video_scriptname(self.CONF_LOG_SCRIPT_NAME)

        CONF_FILE = os.path.join(confdir,self.CONF_CONFIGFILENAME)
        if not os.path.exists(CONF_FILE) :
            self.log("ERROR: %s not present" % CONF_FILE,1)
            sys.exit(1)

        config = ConfigParser.ConfigParser()
        #config.optionxform = str # needed to return case sensitive index
        config.read(CONF_FILE)

        # reading [global] section options
        self.CONF_DEFAULT_PROVIDER = config.get("global","DEFAULT_PROVIDER")
        # save cache under dbroot
        self.CONF_CACHEDIR = os.path.join(dbroot,config.get("global","CACHE_DIRNAME"))

        self.CONF_DL_DESC = config.getint("global","DL_DESC")
        self.CONF_MAX_DAY_EPG = config.getint("global","MAX_DAY_EPG")
        self.CONF_URL = config.get("global","URL")

        # GMT_ZONE 'equal' means the website publishes times in our local
        # zone (delta 0); otherwise it is an hour offset, DST-corrected
        self.CONF_GMT_ZONE = config.get("global","GMT_ZONE")
        if self.CONF_GMT_ZONE.strip(' ').lower() == 'equal':
            #self.DELTA_UTC = -scriptlib.delta_utc() # return negative if timezone is east of GMT (like Italy), invert sign
            self.DELTA_UTC = 0
        else:
            self.DELTA_UTC = float(self.CONF_GMT_ZONE)*3600.0
            if self.DELTA_UTC >= 0:
                self.DELTA_UTC = self.DELTA_UTC + scriptlib.delta_dst()
            else:
                self.DELTA_UTC = self.DELTA_UTC - scriptlib.delta_dst()

        self.DELTA_UTC = int(self.DELTA_UTC)
        #self.log("Website timezone - UTC = %d seconds" % self.DELTA_UTC)

        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("Creating \'%s\' directory for caching" % self.CONF_CACHEDIR)
            os.mkdir(self.CONF_CACHEDIR)

        # reading [channels] section
        temp = config.items("channels");

        # create a dictionary (Python array) with index = channel ID
        for i in temp:
            self.CHANNELLIST[i[0].strip(' \n\r').lower()] = unicode(i[1].strip(' \n\r').lower(),'utf-8')

        if len(self.CHANNELLIST) == 0 :
            self.log("ERROR: [channels] section empty ?",1)
            sys.exit(1)

        # set network socket timeout
        socket.setdefaulttimeout(self.CONF_SOCKET_TIMEOUT)

        self.TODAYMP = time.strftime("%Y/%m/%d")
        # create a list filled with dates (format AAAA/MM/DD) from today to today+ MAX_DAY_EPG
        self.DAYCACHEMP=[]
        for day in range(0,self.CONF_MAX_DAY_EPG):
            self.DAYCACHEMP.append(time.strftime("%Y/%m/%d",time.localtime(time.time()+86400*day)))

# ----------------------------------------------------------------------

    def download_and_cache(self):
        """Download the XML feed (with retries) and parse it, writing one
        cache file per (channel, day) via the SGML callbacks above."""
        self.log("--- START DOWNLOAD AND CACHE DATA ---")
        self.log2video("STARTING DOWNLOAD")

        self.log("Removing old cached files")
        scriptlib.cleanup_oldcachedfiles(self.CONF_CACHEDIR, self.FIELD_SEPARATOR)

        self.log("Start download XML data from \'" + self.CONF_URL+"\'")
        self.log2video("downloading XML data ...")

        # retry loop: i counts down; i == -99 flags success
        i = self.HTTP_ERROR_RETRY
        while i > 0:
            try:
                sock = urllib2.urlopen(self.CONF_URL)
                data = sock.read()
            except IOError, e:
                serr = "unknown"
                if hasattr(e, 'reason'):
                    serr = str(e.reason)
                elif hasattr(e, 'code'):
                    serr = str(e.code)
                    if hasattr(e, 'msg'):
                        serr += " , " + str(e.msg)

                self.log("\'" + self.CONF_URL + "\' connection error. Reason: "+serr+". Waiting "+str(self.HTTP_ERROR_WAIT_RETRY)+" sec. and retry ["+str(i)+"] ...")
                time.sleep(self.HTTP_ERROR_WAIT_RETRY) # add sleep
                i -= 1

            else:
                i = -99
                sock.close()

        if (i != -99):
            # all retries failed
            self.log("Cannot retrieve data from \'" + self.CONF_URL + "\'. Abort script")
            self.log2video("Error: cannot download XML data, abort")
            time.sleep(5)
            sys.exit(1)

        self.log("end download XML data, now processing")
        self.log2video("processing XML data, wait ...")

        # start SGMLLIB parsing
        self.parse(data)

        self.log("end process XML data",1)

# ----------------------------------------------------------------------

    def process_cache(self):
        """Read the cache files (sorted, so files of the same channel id are
        adjacent), group events per channel and inject them into the
        CrossEPG database via scriptlib/lamedb."""
        self.log("--- START PROCESSING CACHE ---")
        self.log2video("START PROCESSING CACHE")
        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("ERROR: %s not present" % self.CONF_CACHEDIR,1)
            sys.exit(1)

        self.log("Loading lamedb")
        lamedb = scriptlib.lamedb_class()

        self.log("Initialize CrossEPG database")
        crossdb = scriptlib.crossepg_db_class()
        crossdb.open_db()

        events = []
        previous_id = ''
        channels_name = ''
        total_events = 0

        self.log("Start data processing")
        filelist = sorted(os.listdir(self.CONF_CACHEDIR))
        # sentinel entry forces a final flush of the last channel's events
        filelist.append('***END***')

        for f in filelist :
            # filename layout: id###name###day
            id = f.split(self.FIELD_SEPARATOR)[0]
            if previous_id == '':
                previous_id = id

            if id != previous_id :
                # channel id changed: flush the accumulated events
                total_events += len(events)
                self.log(" ...processing \'%s\' , nr. events %d" % (previous_id,len(events)))
                self.log2video("processed %d events ..." % total_events )

                for c in channels_name:
                    # a channel can have zero or more SID (different channel with same name)
                    # return the list [0e1f:00820000:0708:00c8:1:0 , 1d20:00820000:2fa8:013e:1:0 , ..... ]
                    # return [] if channel name is not in lamedb
                    sidbyname = lamedb.get_sid_byname(c.strip(' \n').lower())

                    # process every SID
                    for s in sidbyname:
                        # convert "0e1f:00820000:0708:00c8:1:0" to sid,tsid,onid
                        # return the list [sid,tsid,onid]
                        ch_sid = lamedb.convert_sid(s)
                        if len(ch_sid) == 0:
                            continue

                        # add channel into db
                        # doesn't matter if the channel already exist... epgdb do all the work
                        crossdb.add_channel(ch_sid)

                        i = 0
                        L = len(events) - 1

                        # process events; an event's duration is the gap to
                        # the next event's start time
                        for e in events:

                            items = e.split(self.FIELD_SEPARATOR)
                            e_starttime = int(items[1])

                            if i < L :
                                e_length = int(events[i+1].split(self.FIELD_SEPARATOR)[1]) - e_starttime
                            else:
                                # last event, dummy length 90 min.
                                e_length = 5400
                            i += 1

                            # extract title and encode Python Unicode with UTF-8
                            e_title = items[2].encode('utf-8')

                            # extract summarie and encode Python Unicode with UTF-8
                            e_summarie = items[3].encode('utf-8')

                            # add_event(start_time , duration , title , summarie , ISO639_language_code , strings_encoded_with_UTF-8)
                            crossdb.add_event(e_starttime, e_length, e_title, e_summarie, 'ita', True )

                if f == '***END***':
                    break

                # start accumulating the next channel
                events = []
                previous_id = id
                channels_name = ''

            if id == previous_id:
                self.log("Reading \'%s\'" % f)
                # read events from cache file using UTF-8 and insert them in events list
                fd = codecs.open(os.path.join(self.CONF_CACHEDIR, f),"r","utf-8")
                lines = fd.readlines()
                fd.close()
                if channels_name == '':
                    # first line has channel data (id,name,provider,date)
                    channels_name = lines[0].split(self.FIELD_SEPARATOR)[1].split('|')
                # the second line is only a remark
                # add events starting from third line
                events.extend(lines[2:])

        # end process, close CrossEPG DB saving data
        crossdb.close_db()
        self.log("TOTAL EPG EVENTS PROCESSED: %d" % total_events)
        self.log("--- END ---")
        self.log2video("END , events processed: %d" % total_events)
558
559
560
# ****************************************************************************************************************************

# MAIN CODE: SCRIPT START HERE

# increase this process niceness (other processes have higher priority)
os.nice(10)

# set Garbage Collector to do a "generational jump" more frequently than default 700
# memory saving: about 50% (!!), some performance loss (obviously)
gc.set_threshold(50,10,10)

# script home, relative to the CrossEPG install root
SCRIPT_DIR = 'scripts/mediaprem/'

# get CrossEPG installation dir. (False on failure)
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
scriptlocation = os.path.join(crossepg_instroot , SCRIPT_DIR)

# get where CrossEPG save data (dbroot) and use it as script cache repository
crossepg_dbroot = crossepg.epgdb_get_dbroot()
if crossepg_dbroot == False:
    sys.exit(1)

# initialize script class (reads mediaprem.conf, prepares cache dir)
script_class = main(scriptlocation , crossepg_dbroot)

# download data and cache them
script_class.download_and_cache()

# read cached data and inject into CrossEPG database
script_class.process_cache()
593
Note: See TracBrowser for help on using the repository browser.