source: ipk/source/epg_crossepg/var/crossepg/scripts/mediaprem/example-mediaprem-minidom.py@ 19259

Last change on this file (as of revision 19259) was revision 7451, checked in by BPanther, 15 years ago

[ipk] - copy source->source.sh4

File size: 16.5 KB
Line 
1#!/usr/bin/python
2# mediaprem.py by Ambrosa http://www.ambrosa.net
3# this module is used for download EPG data from Mediaset website
4# derived from E2_LOADEPG
5
6__author__ = "ambrosa http://www.ambrosa.net"
7__copyright__ = "Copyright (C) 2008-2011 Alessandro Ambrosini"
8__license__ = "CreativeCommons by-nc-sa http://creativecommons.org/licenses/by-nc-sa/3.0/"
9
10import gc
11import os
12import sys
13import time
14import codecs
15import socket
16import urllib
17import urllib2
18import ConfigParser
19from xml.dom import minidom
20
21# import CrossEPG functions
22import crossepg
23
24# location of local python modules under "scripts/lib" dir.
25# add it to sys.path()
# Locate the CrossEPG installation root; abort the script when it is
# unavailable, then make the bundled "scripts/lib" modules importable.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
sys.path.append(os.path.join(crossepg_instroot, 'scripts/lib'))
31
32# import local modules
33import sgmllib
34import scriptlib
35
36# =================================================================
37# HTML PARSER
38
39
class Description_parser(sgmllib.SGMLParser):
    """Scrape the programme description out of a Mediaset web page.

    The description text sits inside ``<div class="box_Text">`` /
    ``<div class="txtBox_cms">``; every text node seen while the inner
    div is open is appended to ``self.description``.
    """

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        # flags tracking which of the two interesting <div>s are open
        self.start_div_box = False
        self.start_div_boxtxt = False
        self.description = ''

    def parse(self, s):
        # Feed the whole document at once and finalize parsing.
        self.feed(s)
        self.close()

    def start_div(self, attributes):
        # Raise the matching flag when one of the marker <div>s opens.
        for name, value in attributes:
            if name == "class":
                if value == "box_Text":
                    self.start_div_box = True
                elif value == "txtBox_cms":
                    self.start_div_boxtxt = True

    def end_div(self):
        # Any </div> seen while inside the text container ends collection.
        if self.start_div_boxtxt:
            self.start_div_box = False
            self.start_div_boxtxt = False

    def handle_data(self, data):
        # Accumulate text only while inside the description container.
        # Remote pages are ISO-8859-1 encoded.
        if self.start_div_boxtxt:
            self.description += data.decode('iso-8859-1')

    def get_descr(self):
        """Return the collected description, stripped of surrounding whitespace."""
        return self.description.strip(' \n\r')
72
73# =================================================================
74
75
class main:
    """Download Mediaset Premium EPG data, cache it per channel/day, and
    inject the cached events into the CrossEPG database.

    Workflow: ``__init__`` (read mediaprem.conf, sanity checks) ->
    ``download_and_cache`` (fetch XML, write one cache file per
    channel+day) -> ``process_cache`` (parse cache files into CrossEPG).
    """

    # main config file
    CONF_CONFIGFILENAME = "mediaprem.conf"

    # Network socket timeout (in seconds)
    CONF_SOCKET_TIMEOUT = 20

    # log text
    CONF_LOG_SCRIPT_NAME = "MediasetPremium (Italy)"
    CONF_LOG_PREFIX = ""

    # max chars in description
    CONF_DLDESCMAXCHAR = 250

    # retry number if HTTP error
    HTTP_ERROR_RETRY = 3
    # seconds to wait between retries
    HTTP_ERROR_WAIT_RETRY = 5

    # charset used in remote website epg data
    REMOTE_EPG_CHARSET = 'utf-8'

    # today's date ("YYYY/MM/DD"), set in __init__
    TODAYMP = ''
    # list of dates ("YYYY/MM/DD") to fetch, today..today+MAX_DAY_EPG
    DAYCACHEMP = []
    # separator used both in cache file names and inside cache lines
    FIELD_SEPARATOR = '###'
    # channel id -> "cacheopt[,name[,provider]]" (from [channels] section)
    CHANNELLIST = {}


    def log(self,s,video=0):
        """Write *s* to the log file; when video == 1 also echo to the OSD."""
        self.logging.log(self.CONF_LOG_PREFIX + str(s))
        if video == 1:
            self.log2video(str(s))

    def log2video(self,s):
        """Show *s* on the on-screen display status line."""
        self.logging.log2video_status(str(s))

    def convert_daymp(self,dmp):
        """Convert a Mediaset date "YYYY/MM/DD" to compact "YYYYMMDD"."""
        daystandard = time.strftime("%Y%m%d",time.strptime(dmp,"%Y/%m/%d"))
        return daystandard


    def get_description(self,url):
        """Download *url* and return the extracted programme description.

        Returns '' for non-http URLs, URLs that are not .html/.htm pages,
        or on any download error (best effort: errors are logged, not raised).
        """

        if url[:7] != 'http://':
            return('')

        if (url[-5:] != '.html') and (url[-4:] != '.htm') :
            return('')

        self.log(" downloading description \'" + url + "\'")
        # percent-encode everything except the scheme/path separators
        url = str(urllib.quote(url,safe=":/"))

        try:
            sock = urllib2.urlopen(url)
            data = sock.read()
        except IOError, e:
            # build a human readable reason from whatever the exception carries
            serr = "unknown"
            if hasattr(e, 'reason'):
                serr = str(e.reason)
            elif hasattr(e, 'code'):
                serr = str(e.code)
                if hasattr(e, 'msg'):
                    serr += " , " + str(e.msg)

            self.log(url + " error, reason: " + serr + ". Skip it.")
            return('')

        else:
            sock.close()
            dsparser = Description_parser()
            dsparser.parse(data)
            return(dsparser.get_descr())

        return('')



    def __init__(self,confdir,dbroot):
        """Read mediaprem.conf from *confdir*; cache files go under *dbroot*.

        Exits the process (sys.exit(1)) on any fatal condition: no swap,
        missing config file, or empty [channels] section.
        """

        # initialize logging
        self.logging = scriptlib.logging_class()
        # write to video OSD the script name
        self.logging.log2video_scriptname(self.CONF_LOG_SCRIPT_NAME)


        # check swap memory available (XML parsing is memory hungry)
        osp = os.popen('free | awk \'/Swap/ { print $2 }\'','r')
        ret = osp.readlines()
        if len(ret) > 0:
            try:
                # "free" reports KB; convert to MB
                m = int(ret[0])/1024
            except:
                self.log("Error get SWAP value, abort",1)
                time.sleep(10)
                sys.exit(1)

            if m < 60:
                self.log("SWAP Not Enabled (<60MB), abort",1)
                time.sleep(10)
                sys.exit(1)
        else:
            self.log("Error get SWAP value, abort",1)
            time.sleep(10)
            sys.exit(1)

        osp.close()


        CONF_FILE = os.path.join(confdir,self.CONF_CONFIGFILENAME)
        if not os.path.exists(CONF_FILE) :
            self.log("ERROR: %s not present" % CONF_FILE,1)
            sys.exit(1)

        config = ConfigParser.ConfigParser()
        #config.optionxform = str # needed to return case sensitive index
        config.read(CONF_FILE)

        # reading [global] section options
        self.CONF_DEFAULT_PROVIDER = config.get("global","DEFAULT_PROVIDER")
        # save cache under dbroot
        self.CONF_CACHEDIR = os.path.join(dbroot,config.get("global","CACHE_DIRNAME"))

        self.CONF_DL_DESC = config.getint("global","DL_DESC")
        self.CONF_MAX_DAY_EPG = config.getint("global","MAX_DAY_EPG")
        self.CONF_URL = config.get("global","URL")

        # GMT_ZONE is either 'equal' (site times already local) or an
        # hour offset; DST correction is applied away from UTC.
        self.CONF_GMT_ZONE = config.get("global","GMT_ZONE")
        if self.CONF_GMT_ZONE.strip(' ').lower() == 'equal':
            #self.DELTA_UTC = -scriptlib.delta_utc() # return negative if timezone is east of GMT (like Italy), invert sign
            self.DELTA_UTC = 0
        else:
            self.DELTA_UTC = float(self.CONF_GMT_ZONE)*3600.0
            if self.DELTA_UTC >= 0:
                self.DELTA_UTC = self.DELTA_UTC + scriptlib.delta_dst()
            else:
                self.DELTA_UTC = self.DELTA_UTC - scriptlib.delta_dst()

        self.DELTA_UTC = int(self.DELTA_UTC)
        #self.log("Website timezone - UTC = %d seconds" % self.DELTA_UTC)

        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("Creating \'%s\' directory for caching" % self.CONF_CACHEDIR)
            os.mkdir(self.CONF_CACHEDIR)

        # reading [channels] section
        temp = config.items("channels");

        # create a dictionary (Python array) with index = channel ID
        for i in temp:
            self.CHANNELLIST[i[0].strip(' \n\r').lower()] = unicode(i[1].strip(' \n\r').lower(),'utf-8')

        if len(self.CHANNELLIST) == 0 :
            self.log("ERROR: [channels] section empty ?",1)
            sys.exit(1)

        # set network socket timeout
        socket.setdefaulttimeout(self.CONF_SOCKET_TIMEOUT)

        self.TODAYMP = time.strftime("%Y/%m/%d")
        # create a list filled with dates (format AAAA/MM/DD) from today to today+ MAX_DAY_EPG
        self.DAYCACHEMP=[self.TODAYMP]
        for day in range(1,self.CONF_MAX_DAY_EPG):
            self.DAYCACHEMP.append(time.strftime("%Y/%m/%d",time.localtime(time.time()+86400*day)))



# ----------------------------------------------------------------------


    def download_and_cache(self):
        """Fetch the EPG XML feed and write one cache file per channel/day.

        Cache file name: "<id>###<name>###<YYYYMMDD>" (escaped); first
        line carries channel metadata, second is a header remark, then
        one "start###unixgmt###title###description" line per event.
        Exits the process when the feed cannot be downloaded or parsed.
        """
        self.log("--- START DOWNLOAD AND CACHE DATA ---")
        self.log2video("STARTING DOWNLOAD")

        self.log("Removing old cached files")
        scriptlib.cleanup_oldcachedfiles(self.CONF_CACHEDIR, self.FIELD_SEPARATOR)

        chlist = self.CHANNELLIST

        self.log("Start download XML data from \'" + self.CONF_URL+"\'")
        self.log2video("downloading XML data ...")

        # retry loop: i counts down on failure, -99 flags success
        i = self.HTTP_ERROR_RETRY
        while i > 0:
            try:
                sock = urllib2.urlopen(self.CONF_URL)
                data = sock.read()
            except IOError, e:
                serr = "unknown"
                if hasattr(e, 'reason'):
                    serr = str(e.reason)
                elif hasattr(e, 'code'):
                    serr = str(e.code)
                    if hasattr(e, 'msg'):
                        serr += " , " + str(e.msg)

                self.log("\'" + self.CONF_URL + "\' connection error. Reason: "+serr+". Waiting "+str(self.HTTP_ERROR_WAIT_RETRY)+" sec. and retry ["+str(i)+"] ...")
                time.sleep(self.HTTP_ERROR_WAIT_RETRY) # add sleep
                i -= 1

            else:
                i = -99
                sock.close()

        if (i != -99):
            self.log("Cannot retrieve data from \'" + self.CONF_URL + "\'. Abort script")
            self.log2video("Error: cannot download XML data, abort")
            time.sleep(5)
            sys.exit(1)

        self.log("End download XML data, now processing XML code.")
        self.log2video("preprocessing XML data, wait ...")
        try:
            xmldoc = minidom.parseString(data)
        except:
            self.log("Warning ! Data are not in a valid XML format. Abort script")
            self.log2video("Error: no valid XML data, abort")
            time.sleep(5)
            sys.exit(1)


        self.log("End process XML data")
        self.log2video("end process XML data")

        # days list: <giorno data="YYYY/MM/DD"> elements
        xmlref_giorno = xmldoc.getElementsByTagName('giorno')
        for xml_gg in xmlref_giorno:
            gg = xml_gg.attributes["data"].value
            if gg not in self.DAYCACHEMP :
                continue

            # channels for this day: <canale id="...">
            xmlref_canale = xml_gg.getElementsByTagName('canale')
            for xml_ch in xmlref_canale:
                chid = xml_ch.attributes["id"].value.strip(' \n\r').lower()
                if not chlist.has_key(chid) :
                    self.log("Warning: new channel \"id=%s name=%s\" found in XML data" % (xml_ch.attributes["id"].value,xml_ch.attributes["description"]))
                    continue

                # a "<id>+1" entry in the config means a timeshift twin channel
                clist = [chid]
                if self.CHANNELLIST.has_key(chid + '+1'):
                    clist.append(chid + '+1')

                for c in clist:

                    # get cache option
                    # 0 : don't download/cache
                    # 1 : download and cache (optional 1,new_name )
                    # 2 : always download overwriting existing files (optional 2,new_name )
                    # 3 : always download overwriting existing files only for TODAY (optional 3,new_name )

                    cacheopt = int(chlist[c].split(",")[0])

                    # if cacheopt == 0, do nothing
                    if cacheopt == 0:
                        continue

                    channel_name = ''
                    if len(chlist[c].split(",")) > 1 :
                        if chlist[c].split(",")[1] != '' :
                            # channel renamed, new name provided by user
                            channel_name = chlist[c].split(",")[1].strip(' \n\r').lower()

                    # if channel name is not present as option, quit with error
                    if channel_name == '':
                        self.log("ERROR ! ID=%s channel name not present" % c)
                        sys.exit(1)

                    channel_provider = self.CONF_DEFAULT_PROVIDER
                    if len(chlist[c].split(",")) > 2 :
                        if chlist[c].split(",")[2] != '' :
                            channel_provider = chlist[c].split(",")[2].strip(' \n\r').lower()

                    # if channel name is not present as option in channel_list.conf , quit with error
                    # NOTE(review): unreachable — the identical check above already
                    # called sys.exit(1) when channel_name is empty
                    if channel_name == '':
                        self.log("ERROR ! ID=" + str(c) + " channel name not present. Skip !")
                        continue

                    # download only if file doesn't exist or cacheopt == 2 (always download),
                    # using open(...,"w") files will be overwritten (saving a delete + create)

                    day = str(self.convert_daymp(gg))
                    eventfilename = scriptlib.fn_escape(str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + day)
                    eventfilepath = os.path.join(self.CONF_CACHEDIR, eventfilename)
                    if (cacheopt == 1) and os.path.exists(eventfilepath):
                        continue
                    if (cacheopt == 3) and os.path.exists(eventfilepath) and (gg != self.TODAYMP):
                        continue
                    if (cacheopt != 1) and (cacheopt != 2) and (cacheopt != 3):
                        self.log("Warning: unknown cache option " + str(cacheopt))
                        # NOTE(review): exit_for_loop is assigned but never read — dead code
                        exit_for_loop = True
                        continue

                    num_events = 0
                    # NOTE(review): video=2 means "log file only" — log() echoes to OSD only when video == 1
                    self.log(" Writing in cache \'" + eventfilename + "\'",2)
                    self.log2video(" extracting \"%s\" [%d] (%s)" % (channel_name, num_events, day))

                    fd=codecs.open(eventfilepath,"w",'utf-8')

                    fd.write(str(c) + self.FIELD_SEPARATOR + channel_name + self.FIELD_SEPARATOR + channel_provider + self.FIELD_SEPARATOR + day + '\n')
                    fd.write("Local Time (human readeable)###Unix GMT Time###Event Title###Event Description\n")

                    # events for this channel: <prg orainizio="HH:MM">
                    xmlref_events = xml_ch.getElementsByTagName('prg')
                    for xml_ee in xmlref_events:
                        orainiz = xml_ee.attributes["orainizio"].value

                        # events between 00:00 and 05:59 belong to the next calendar day
                        if (orainiz >='00:00') and (orainiz <= '05:59') :
                            nextdayevent = 86400
                        else:
                            nextdayevent = 0

                        event_starttime = gg + " " + orainiz

                        if c == (chid + '+1'):
                            # manage channel "+1": shift start time by one hour
                            event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime,"%Y/%m/%d %H:%M"))) - self.DELTA_UTC + 3600 + nextdayevent)
                        else:
                            # normal channel, not "+1"
                            event_startime_unix_gmt = str(int(time.mktime(time.strptime(event_starttime,"%Y/%m/%d %H:%M"))) - self.DELTA_UTC + nextdayevent)


                        # title must be a single line: strip CR/LF and spaces
                        event_title = unicode(xml_ee.getElementsByTagName('titolo')[0].firstChild.data)
                        event_title = event_title.replace('\r','')
                        event_title = event_title.replace('\n','')
                        event_title = event_title.strip(u' ')

                        event_description = ''
                        if self.CONF_DL_DESC == 1 :
                            # optionally fetch the long description from the linked page
                            url_desc = xml_ee.getElementsByTagName('linkScheda')[0].firstChild.data
                            event_description = unicode(self.get_description(url_desc.strip(' \n\r'))[:self.CONF_DLDESCMAXCHAR])
                            event_description = event_description.replace('\r','')
                            event_description = event_description.replace('\n',u' ')
                            event_description = event_description.strip(u' ')

                        fd.write(event_starttime + self.FIELD_SEPARATOR + event_startime_unix_gmt + self.FIELD_SEPARATOR + event_title + self.FIELD_SEPARATOR + event_description + '\n')
                        num_events += 1
                        self.log2video(" extracting \"%s\" [%d] (%s)" % (channel_name, num_events, day))


                    fd.close()

        # free the DOM explicitly: it can be large on low-memory receivers
        del xmldoc

# ----------------------------------------------------------------------


    def process_cache(self):
        """Read the cache files and inject their events into CrossEPG.

        Files are processed sorted by name, so all files of one channel id
        are consecutive; events are flushed to the DB whenever the id
        changes. The sentinel '***END***' forces a final flush.
        """
        self.log("--- START PROCESSING CACHE ---")
        self.log2video("START PROCESSING CACHE")
        if not os.path.exists(self.CONF_CACHEDIR):
            self.log("ERROR: %s not present" % self.CONF_CACHEDIR,1)
            sys.exit(1)

        self.log("Loading lamedb")
        lamedb = scriptlib.lamedb_class()

        self.log("Initialize CrossEPG database")
        crossdb = scriptlib.crossepg_db_class()
        crossdb.open_db()

        events = []
        previous_id = ''
        channels_name = ''
        total_events = 0

        self.log("Start data processing")
        filelist = sorted(os.listdir(self.CONF_CACHEDIR))
        # sentinel entry guarantees the last channel group gets flushed
        filelist.append('***END***')

        for f in filelist :
            id = f.split(self.FIELD_SEPARATOR)[0]
            if previous_id == '':
                previous_id = id

            # channel id changed: flush the accumulated events of the previous id
            if id != previous_id :
                total_events += len(events)
                self.log(" ...processing \'%s\' , nr. events %d" % (previous_id,len(events)))
                self.log2video("processed %d events ..." % total_events )

                for c in channels_name:
                    # a channel can have zero or more SID (different channel with same name)
                    # return the list [0e1f:00820000:0708:00c8:1:0 , 1d20:00820000:2fa8:013e:1:0 , ..... ]
                    # return [] if channel name is not in lamedb
                    sidbyname = lamedb.get_sid_byname(c.strip(' \n').lower())

                    # process every SID
                    for s in sidbyname:
                        # convert "0e1f:00820000:0708:00c8:1:0" to sid,tsid,onid
                        # return the list [sid,tsid,onid]
                        ch_sid = lamedb.convert_sid(s)
                        if len(ch_sid) == 0:
                            continue

                        # add channel into db
                        # doesn't matter if the channel already exist... epgdb do all the work
                        crossdb.add_channel(ch_sid)

                        i = 0
                        L = len(events) - 1

                        # process events
                        for e in events:

                            items = e.split(self.FIELD_SEPARATOR)
                            e_starttime = int(items[1])

                            # event duration = gap to the next event's start
                            if i < L :
                                e_length = int(events[i+1].split(self.FIELD_SEPARATOR)[1]) - e_starttime
                            else:
                                # last event, dummy length 90 min.
                                e_length = 5400
                            i += 1

                            # extract title and encode Python Unicode with UTF-8
                            e_title = items[2].encode('utf-8')

                            # extract summarie and encode Python Unicode with UTF-8
                            e_summarie = items[3].encode('utf-8')

                            # add_event(start_time , duration , title , summarie , ISO639_language_code , strings_encoded_with_UTF-8)
                            crossdb.add_event(e_starttime, e_length, e_title, e_summarie, 'ita', True )

                if f == '***END***':
                    break

                events = []
                previous_id = id
                channels_name = ''

            if id == previous_id:
                self.log("Reading \'%s\'" % f)
                # read events from cache file using UTF-8 and insert them in events list
                fd = codecs.open(os.path.join(self.CONF_CACHEDIR, f),"r","utf-8")
                lines = fd.readlines()
                fd.close()
                if channels_name == '':
                    # first line has channel data (id,name,provider,date)
                    # name may be "a|b|..." when several lamedb names map to one id
                    channels_name = lines[0].split(self.FIELD_SEPARATOR)[1].split('|')
                # the second line is only a remark
                # add events starting from third line
                events.extend(lines[2:])

        # end process, close CrossEPG DB saving data
        crossdb.close_db()
        self.log("TOTAL EPG EVENTS PROCESSED: %d" % total_events)
        self.log("--- END ---")
        self.log2video("END , events processed: %d" % total_events)
522
523
524
525# ****************************************************************************************************************************
526
527# MAIN CODE: SCRIPT START HERE
528
529# increase this process niceness (other processes have higher priority)
# ---------------------------------------------------------------------------
# Script entry point
# ---------------------------------------------------------------------------

# Run at lower priority so other receiver processes are not starved.
os.nice(10)

# Make the garbage collector run generational passes more often than the
# default threshold of 700: saves roughly half the memory at some CPU cost.
gc.set_threshold(50,10,10)

SCRIPT_DIR = 'scripts/mediaprem/'

# Resolve the CrossEPG installation directory; abort when unknown.
crossepg_instroot = crossepg.epgdb_get_installroot()
if crossepg_instroot == False:
    sys.exit(1)
scriptlocation = os.path.join(crossepg_instroot , SCRIPT_DIR)

# Resolve the CrossEPG data directory (dbroot), used as the cache root.
crossepg_dbroot = crossepg.epgdb_get_dbroot()
if crossepg_dbroot == False:
    sys.exit(1)

# Build the worker, download/cache the feed, then load it into CrossEPG.
script_class = main(scriptlocation , crossepg_dbroot)
script_class.download_and_cache()
script_class.process_cache()
557
Note: See TracBrowser for help on using the repository browser.