[mythtv-users] New Australian XMLTV grabber

Eyal Lebedinsky eyal at eyal.emu.id.au
Thu Nov 4 23:58:15 UTC 2004


Paul Andreassen wrote:
> Hi again,
> 
> Here is another version which performs much better with the use of a
> queue.  Ten threads make it much faster.
> 
> I left the 'print' in to see if it actually occurs, and it doesn't.  The
> problem was that the 'getstore' on the line above would return without a
> file.  I simply retry until a file is downloaded, and there are no more
> problems opening the file.

Here is a version where the selection of location is a bit easier.
Hope this saves some Australian fingers.

Other lesser locations (Sydney, Melbourne, etc.) should add their stuff.

-- 
Eyal Lebedinsky (eyal at eyal.emu.id.au) <http://samba.org/eyal/>
-------------- next part --------------
#!/usr/bin/perl -w

# Australian TV Guide XMLTV grabber by Damon Searle
# Derived from a yahoo XMLTV grabber by Ron Kellam which was itself...
# Derived from original code by Justin Hawkins
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# 30 Oct 2004
#  Damon Searle <djsearle at netspace.net.au>
#  - wrote first version
#  - gets data from NineMSN as a backup. It's not that fancy,
# 31 Oct 2004
#  Fred Donelly <fdonelly at hotmail.com> 
#  - added an option so that the output file can be specified on the 
#    command line and from the quick test I gave it, it now works with 
#    mythfilldatabase.
#  - $offset set to +1000 at the top and then had "+1000" set in a 
#  output string further down rather than the variable
# 4 Nov 2004
#  Paul Andreassen <paulx at andreassen.com.au>
#  - learned some perl and now wants to go back to python
#  - added and then reduced status info
#  - retry on failure to getstore
#  - changed cache to '/var/local/tv_grab_au'
#  - added threading for each day
# 5 Nov 2004
#  - improved threading with use of queue
#  Eyal Lebedinsky <eyal at eyal.emu.id.au>
#  - easier location selection


# *** Only tested with Queensland and ACT data ***

# Instructions:
# Go to http://tvguide.ninemsn.com.au/guide/ select your area
# Look at the last number in the URL before ".asp" and set
# the region variable below. Then put the channel names as listed
# on the tv guide site into the variables below.
# Then set your XMLTV ids from the database in the _XMLTVID variables.

# If it doesn't work with mythfilldatabase, try:
# ./tv_grab_au
# mythfilldatabase --file 1 -1 /var/local/tv_grab_au/guide.xml

use strict;

use File::Path;
use Getopt::Long;
use IO::File;
use threads;
use Thread::Queue;

use Date::Manip;
use LWP::Simple;
use XMLTV;

# pick your region
#
  my $location = "Canberra";
# my $location = "Brisbane";

# pick your source
#
  my $source = "free";
# my $source = "freesd";
# my $source = "freehd";
#
# NOTE: a source is only usable if the chosen location lists it in the
# 'sources' table below ("freehd" has no entry for any location yet).

# choose the XMLID URL suffix that mythtv knows
#
  my $XMLTVID_URL = "d1.com.au";


# Variables

my $days_to_grab = 7;		# default number of days; --days overrides it
my $guide_url = "http://tvguide.ninemsn.com.au/guide/";
my $details_url = "http://tvguide.ninemsn.com.au/closeup/default.asp?pid=";
my $cache_dir = "/var/local/tv_grab_au";	# downloaded pages are kept here
my $retrys = 5;			# download attempts per page
my $threads = 10;		# number of download worker threads

# Every XMLTV channel id has the form
#   <source>.<location>.<station code>.<XMLTVID_URL>
my $XMLTV_prefix = $source . "." . $location . ".";
my $XMLTV_suffix = "." . $XMLTVID_URL;

# Per-location settings.  To support another location add one entry here:
#   region  - the number that appears before ".asp" in the guide URL
#   offset  - timezone offset appended to every start/stop time
#             NOTE(review): this is a fixed string, so it does not follow
#             daylight-saving changes -- confirm it suits your dates.
#   names   - station names exactly as the guide site prints them
#   sources - for each supported $source, the station code that goes into
#             the XMLTV id of each station
my %known_locations = (
	"Canberra" => {
		region  => "126",
		offset  => "+1100",
		names   => {
			ABC   => "ABC NSW",
			Prime => "Prime Southern",		# Channel 7 in Sydney/Melbourne/etc
			SBS   => "SBS Sydney",
			Ten   => "Southern Cross TEN Capital",
			WIN   => "WIN Television NSW",		# Channel 9 in Sydney/Melbourne/etc
		},
		sources => {
			"free"   => { ABC => "2", Prime => "PrimS", SBS => "SBS", Ten => "10Cap", WIN => "WIN" },
			"freesd" => { ABC => "2", Prime => "7",     SBS => "SBS", Ten => "10",    WIN => "9"   },
		},
	},
	"Brisbane" => {
		region  => "79",
		offset  => "+1000",
		names   => {
			ABC   => "ABC QLD",
			Prime => "Channel Seven Queensland",
			SBS   => "SBS Queensland",
			Ten   => "Southern Cross TEN Queensland",
			WIN   => "WIN Television QLD",
		},
		sources => {
			"free" => { ABC => "2", Prime => "7", SBS => "SBS", Ten => "10", WIN => "9" },
		},
	},
);

# Refuse to run with a location/source combination we have no data for.
my $settings = $known_locations{$location};
if (!defined $settings) {
	print STDERR "Unknown location '$location'\n";
	exit (1);
}

my $station_codes = $settings->{sources}{$source};
if (!defined $station_codes) {
	print STDERR "Unknown source '$source' for $location\n";
	exit (1);
}

my @stations = qw(ABC Prime SBS Ten WIN);

my $region = $settings->{region};
my $offset = $settings->{offset};

# Station names as shown on the guide site.
my ($ABC, $Prime, $SBS, $Ten, $WIN) = @{ $settings->{names} }{@stations};

# Matching XMLTV channel ids.
my ($ABC_XMLTVID, $Prime_XMLTVID, $SBS_XMLTVID, $Ten_XMLTVID, $WIN_XMLTVID) =
	map { $XMLTV_prefix . $_ . $XMLTV_suffix } @{$station_codes}{@stations};


# Command line: --days N limits how many days are grabbed, --output FILE
# chooses where the XMLTV data is written.
my $opt_days;
my $opt_output;

GetOptions(
	'days=i'   => \$opt_days,
	'output=s' => \$opt_output,
);

# A missing (or zero) --days keeps the built-in default.
$days_to_grab = $opt_days if $opt_days;

# Without --output the guide is written into the cache directory.
$opt_output ||= $cache_dir . "/guide.xml";


print "$days_to_grab, $opt_output\n";
#exit(0);


my $currentday = &ParseDate("today");
my $prog_ref;	# array ref of programme records, filled in further down
my $chan_ref;	# hash ref of channel records, keyed by guide-site station name

# One channel record per station: its XMLTV id plus the station name as
# the display name (no language given).
foreach my $station (
	[ $ABC,   $ABC_XMLTVID   ],
	[ $Prime, $Prime_XMLTVID ],
	[ $SBS,   $SBS_XMLTVID   ],
	[ $Ten,   $Ten_XMLTVID   ],
	[ $WIN,   $WIN_XMLTVID   ],
) {
	my ($station_name, $xmltv_id) = @$station;
	$$chan_ref{$station_name} = {
		'id'           => $xmltv_id,
		'display-name' => [ [ $station_name, undef ] ],
	};
}


# Download phase: a pool of worker threads runs fetch_details, each one
# taking "date-pid" strings off the shared queue.
my @thrlist;
my $datepids = Thread::Queue->new;

print "starting threads\n";

foreach my $worker (1 .. $threads)
{
	push @thrlist, threads->new(\&fetch_details);
}

print "loading queue\n";

# Walk the guide page of every requested day and queue each programme id
# that is linked from it.
my $day_counter = 1;
until ($day_counter > $days_to_grab)
{
	my $date = &UnixDate($currentday, "%d%m%Y");

	foreach my $line (get_day($date))
	{
		foreach my $link (split /\n|tr|TR|TD|tr/, $line)
		{
			next unless $link =~ /closeup\/default.asp/;

			# keep only what sits between "pid=" and the closing quote
			$link =~ s/.+pid=//g;
			$link =~ s/".+//g;
			next unless $link =~ /\d+/;

			$datepids->enqueue($date . "-" . $link);
		}
	}

	$currentday = &DateCalc($currentday, "+ 1 day");
	$day_counter++;
}

# One "0-0" terminator per worker tells the threads to finish.
$datepids->enqueue("0-0") foreach 1 .. $threads;

print "queue is complete\n";

$_->join foreach @thrlist;

print "all threads done\n";

# Parse phase: walk the guide again one day at a time and turn the detail
# page of every linked programme into an XMLTV programme record, appended
# to @$prog_ref.  Pages are read through get_details, which uses the cached
# copy when there is one and downloads the page otherwise.
$day_counter = 1;
$currentday = ParseDate("today");
while ($day_counter <= $days_to_grab)
{
	my $date = UnixDate($currentday, "%d%m%Y");

	# Programme ids linked from this day's guide page.
	my @pids;
	foreach my $line (get_day($date))
	{
		foreach my $link (split /\n|tr|TR|TD/, $line)
		{
			next unless $link =~ /closeup\/default.asp/;

			# keep only what sits between "pid=" and the closing quote
			$link =~ s/.+pid=//g;
			$link =~ s/".+//g;
			push @pids, $link if $link =~ /\d+/;
		}
	}

	# @pids is used as a work list: a programme whose page turns out to be
	# truncated goes back on the end, but at most $retrys times so a page
	# that is permanently broken cannot keep the script running forever.
	my %attempts;
	while (@pids)
	{
		my $pid = shift @pids;
		my @details = get_details($date, $pid);

		# Keep the lines after one containing "bgcolor=#ffffff" up to the
		# next one containing "bgColor=#f7f3e8".
		my $show_details_table = "";
		my $use_line = 0;
		foreach my $line (@details)
		{
			$use_line = 0 if $line =~ /bgColor=#f7f3e8/;
			$show_details_table .= $line if $use_line;
			$use_line = 1 if $line =~ /bgcolor=#ffffff/;
		}

		# Every tag becomes a line break, so each piece of text ends up at
		# a fixed line number within the table.
		$show_details_table =~ s/<[^>]*>/\n/g;
		$show_details_table =~ s/\&nbsp\;//g;

		my @fields = split /\n/, $show_details_table;

		if (@fields < 28)
		{
			my $name = $cache_dir . "/" . $date . "/" . $pid . ".html";
			unlink $name;
			if (++$attempts{$pid} < $retrys)
			{
				print "\n$name is too short, removing and trying again\n";
				push @pids, $pid;
			}
			else
			{
				print "\n$name is still too short after $retrys attempts, skipping\n";
			}
			next;
		}

		# Fixed positions within the flattened table.
		# (Line 22 holds the rating, which is not written to the output.)
		my $time    = $fields[4];
		my $channel = $fields[7];
		my $title1  = $fields[19];
		my $title2  = $fields[20];
		$title2 =~ s/ - //g;
		my $duration = $fields[21];	# run time; only the digits are kept
		$duration =~ s/\D//g;
		my $genre = $fields[26];
		$genre =~ s/ //g;
		my $descr = "";
		if (defined $fields[28] && $fields[28] =~ /[a-zA-Z]/)
		{
			$descr = $fields[28];
		}

		my $start_time = UnixDate($time, "%H:%M");
		if (!defined $start_time || $duration eq "")
		{
			# Without a start time and a run time no start/stop can be
			# computed; leave the programme out rather than emit garbage.
			print "\nCan't work out the time of programme $pid, skipping\n";
			next;
		}

		# Programmes starting between midnight and 05:59 are listed on the
		# previous day's page, so they really belong to the next date.
		my $start_date = UnixDate($currentday, "%Y-%m-%d");
		if ($start_time =~ /^0[0-5]:/)
		{
			$start_date = DateCalc($start_date, "+ 1 day");
		}
		$start_date = UnixDate($start_date, "%Y%m%d");

		my $end_time = DateCalc($start_time, " + " . $duration . "minutes");
		$end_time = UnixDate($end_time, "%H:%M");

		# An end time earlier than the start time means the programme runs
		# past midnight.
		my $end_date;
		if (Date_Cmp($start_time, $end_time) <= 0)
		{
			$end_date = $start_date;
		}
		else
		{
			$end_date = UnixDate(DateCalc($start_date, "+ 1 day"), "%Y%m%d");
		}

		# Replace the station name by its XMLTV id; first match wins.  The
		# names are quoted so they are matched literally.  A name that
		# matches no known station is passed through unchanged.
		foreach my $station (
			[ $ABC,   $ABC_XMLTVID   ],
			[ $Prime, $Prime_XMLTVID ],
			[ $SBS,   $SBS_XMLTVID   ],
			[ $Ten,   $Ten_XMLTVID   ],
			[ $WIN,   $WIN_XMLTVID   ],
		) {
			my ($station_name, $xmltv_id) = @$station;
			if ($channel =~ /\Q$station_name\E/)
			{
				$channel = $xmltv_id;
				last;
			}
		}

		my $start = $start_date . UnixDate($start_time, "%H%M") . "00 " . $offset;
		my $stop  = $end_date   . UnixDate($end_time,   "%H%M") . "00 " . $offset;

		my $a_prog = {
			channel => $channel,
			start   => $start,
			stop    => $stop,
			title   => [ [ $title1, undef ] ]
		};

		$descr =~ s/^\s+//;
		$descr =~ s/\s+$//;

		# Optional elements are only added when there is something to say.
		if ($title2) { $$a_prog{'sub-title'} = [ [ $title2, undef ] ]; }
		if ($descr)  { $$a_prog{desc}        = [ [ $descr, undef ] ]; }
		if ($genre)  { $$a_prog{category}    = [ [ $genre, undef ] ]; }

		push @$prog_ref, $a_prog;
	}

	$day_counter++;
	$currentday = DateCalc($currentday, "+ 1 day");
}

# Assemble the structure XMLTV::write_data expects: encoding, document
# credits, channels, programmes.
my $data = [
    'ISO-8859-1',
    {
     'source-info-name'    => 'http://tvguide.ninemsn.com.au/',
     'generator-info-url'  => '',
     'generator-info-name' => "XMLTV - tv_grab_au NineMSN v0.2"
    },
    $chan_ref,
    $prog_ref
];


#my $outfile = $cache_dir . "/guide.xml";
my $outfile = $opt_output;

# Stop with a clear message if the file cannot be created, instead of
# handing an undefined handle to the writer.
my $fh = IO::File->new($outfile, "w")
	or die "Can't open $outfile for writing: $!\n";
XMLTV::write_data($data, OUTPUT=>$fh);

# Write errors on a buffered handle only show up when it is closed.
if ($fh->opened)
{
	$fh->close or die "Can't finish writing $outfile: $!\n";
}

# Return the guide page for one day as a list of lines.
#
# The page is kept as <cache_dir>/<date>/guide.html.  When that file does
# not exist yet it is downloaded first, trying up to $retrys times in all
# ("." is printed for the first attempt, ":" for each further one).
#
# Parameter: $date - the date string used in the guide URL and as the name
#                    of the cache sub-directory (callers pass ddmmyyyy).
# Returns:   the lines of the page; an empty list, after printing a
#            message, when the file could not be opened.
sub get_day 
{
	my $date = shift;
	my $url = $guide_url . $date . "_" . $region . ".asp";
	
	my $guide_dir = $cache_dir . "/" . $date;
	my $guide_file = $guide_dir . "/guide.html";
	mkpath ($guide_dir);

	if (!(-e $guide_file))
	{
		getstore($url, $guide_file);
		print ".";
		
		for (my $retry=1; (!(-e $guide_file)) && ($retry<$retrys); $retry++)
		{
			getstore($url, $guide_file);
			print ":";
		}
	}
	
	# Lexical handle and explicit read mode; do not go on to read from a
	# handle that failed to open.
	my $guide_fh;
	if (!open($guide_fh, '<', $guide_file))
	{
		print "\nCan't open $guide_file\n";
		return;
	}
	my @guide_lines = <$guide_fh>;
	close($guide_fh);
	return @guide_lines;
}

# Return the detail ("closeup") page of one programme as a list of lines.
#
# The page is kept as <cache_dir>/<date>/<program_id>.html.  When that file
# does not exist yet it is downloaded first, trying up to $retrys times in
# all ("." is printed for the first attempt, ":" for each further one).
#
# Parameters: $date       - name of the cache sub-directory (callers pass
#                           the ddmmyyyy date of the guide page)
#             $program_id - the pid value appended to the details URL
# Returns:    the lines of the page; an empty list, after printing a
#             message, when the file could not be opened.
sub get_details
{
	my $date = shift;
	my $program_id = shift;
	
	my $url = $details_url . $program_id;
	my $guide_dir = $cache_dir . "/" . $date;
	my $details_file = $guide_dir . "/" . $program_id . ".html";
	mkpath ($guide_dir);
	
	if (!(-e $details_file))
	{
		getstore($url, $details_file);
		print ".";	
		
		for (my $retry=1; (!(-e $details_file)) && ($retry<$retrys); $retry++)
		{
			getstore($url, $details_file);
			print ":";
		}
	}
	
	# Lexical handle and explicit read mode; do not go on to read from a
	# handle that failed to open.
	my $details_fh;
	if (!open($details_fh, '<', $details_file))
	{
		print "\nCan't open $details_file\n";
		return;
	}
	my @details_lines = <$details_fh>;
	close($details_fh);
	return @details_lines;
}

# Thread body: download programme detail pages into the cache.
#
# Takes "<date>-<pid>" strings from the $datepids queue until the
# terminator "0-0" arrives.  For each one the page is stored as
# <cache_dir>/<date>/<pid>.html unless that file already exists; the
# download is attempted up to $retrys times.  Takes no arguments and
# returns nothing.
sub fetch_details
{
	while (defined(my $datepid = $datepids->dequeue))
	{
		# Compare the terminator as a string: a numeric test would treat
		# any pid that does not start with a digit as 0 and stop the
		# worker early.
		last if $datepid eq "0-0";

		# Split on the first "-" only, so the pid is kept whole.
		my ($date, $pid) = split /-/, $datepid, 2;
		next unless defined $pid && $date ne "" && $pid ne "";

		my $guide_dir = $cache_dir . "/" . $date;
		mkpath ($guide_dir);

		my $url = $details_url . $pid;
		my $details_file = $guide_dir . "/" . $pid . ".html";

		for (my $retry=0; (!(-e $details_file)) && ($retry<$retrys); $retry++)
		{
			getstore($url, $details_file);
		}
	}
	return;
}


More information about the mythtv-users mailing list