[mythtv-users] Bell Expressvu PPV Scraper script

Andrew Saunders saunders at pagpaintball.com
Wed May 10 18:03:29 EDT 2006


For some reason my attachment didn't post the first time. A few 
people emailed, so I'm reposting the script inline below. The background on it is in this earlier thread: 
http://www.gossamer-threads.com/lists/mythtv/users/203199#203199

<?php

    /*
     
     Bell ExpressVu PPV Scraper for MythTV v0.19 - 0.19.1 svn
     by Andrew Saunders (saunders at pagpaintball.com)
     
     ChangeLog:
         April 29, 2006 - v0.2 - minor fixes plus updates for the minor 
changes to the guide.
         April 24, 2006 - v0.1 - fully working version
     
     Installation:
         step 1 - this requires the CLI version of PHP5 (may work with 
4), must include mysqli and cURL support.
         step 2 - select which channels you want from labs.zap2it.com or 
another source and run "mythfilldatabase".
         step 3 - set config info (below) and select which PPV channels 
you wish to update from below.
         step 4 - run from either a web browser or commandline (use 
browser if debugging).
         step 5 - enjoy having ppv info in your guide.
        
     ToDo:
         - experiment with curl_multi to grab multiple pages at once.
        
     === NOTE : THIS IS A BETA RELEASE, IT WORKS FOR ME, YMMV! ===
     
    */
   
    /* 
------------------------------------------------------------------------
        CONFIG
    
------------------------------------------------------------------------ */
   
    // Every comment in this section is kept on its own `//` line(s): a `//`
    // comment that wraps onto a second line without its own `//` is a parse error.

    // 0=no output, 1=minimal, 2=verbose, (3&4 debugging only!)
    // 3=include runtime variables, 4=all runtime info
    $debugLevel = 4;
    $daysToScrape = 2;            // how many days to scrape (read note above first)
    $timezone = 'AST';            // options are : PST MST CST EST AST NEWF
    $mysqlName = 'root';        // mysql name
    $mysqlPass = '';            // mysql password
    $mysqlDB = 'mythconverg';    // mythtv db name
    $mysqlAddr = 'localhost';    // db address
    // sourceID to update (find this in mythweb > edit settings > channel info)
    $sourceID = 3;
    $channelsToScrape = array();
    // uncomment for which channels you want scraped (for speed's sake please
    // only select what you watch and if it's in season).
    // Each disabled group is a single line so removing one leading `//` enables it.
    // french PPV
    //array_push($channelsToScrape, 156, 157, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177);
    // english PPV
    array_push($channelsToScrape, 351, 352, 353, 354, 355, 356, 357,
        358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371,
        372, 373, 374, 375, 376, 377, 378, 379, 380, 381);
    // soccer
    //array_push($channelsToScrape, 403);
    // nhl
    //array_push($channelsToScrape, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438);
    // nascar
    //array_push($channelsToScrape, 440, 441, 442, 443, 444, 445, 446);
    // nfl
    //array_push($channelsToScrape, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468);
    // hpi tv
    //array_push($channelsToScrape, 475, 476, 477);
    // cricket
    //array_push($channelsToScrape, 703);
    // poland
    //array_push($channelsToScrape, 711);
    // kids PPV
    array_push($channelsToScrape, 560, 561);
    // venus PPV
    //array_push($channelsToScrape, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760);
    // HD PPV
    //array_push($channelsToScrape, 830, 831, 832, 833);

    /* 
------------------------------------------------------------------------
        DO NOT EDIT BELOW THIS LINE UNLESS YOU KNOW WHAT YOU'RE DOING
    
------------------------------------------------------------------------ */

    set_time_limit(60 * 60);    // 1hr max runtime for script to finish

    // setup mysql
    // `new mysqli(...)` always evaluates to an object, so an `or die()` tacked
    // onto it never runs; the connection result has to be checked explicitly,
    // and with the mysqli error function rather than the old mysql_* one.
    $dbi = new mysqli($mysqlAddr, $mysqlName, $mysqlPass, $mysqlDB);
    if (mysqli_connect_errno()) {
        die('Could not connect: ' . mysqli_connect_error());
    }
    $stmt = $dbi->stmt_init();    // single statement handle reused for every query

    // setup cURL
    $ch = curl_init();
    // CURL_HTTP_VERSION_1_1 is a *value*; the option it belongs to is CURLOPT_HTTP_VERSION
    curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);    // curl_exec() returns the page (or false)
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
    // only the header *value* goes here; cURL adds the "User-Agent:" name itself
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2');
    // default: start with a fresh cookie on every run.
    // NOTE(review): per the libcurl docs a jar name of '-' writes the cookies to
    // stdout when the handle is closed - confirm that output is acceptable.
    curl_setopt($ch, CURLOPT_COOKIEJAR, '-');
    // or comment out the line above and uncomment below to save cookies between instances
    //$cookieFile = '/dir/to/xvu_cookie.txt';
    //curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
    //curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
    // uncomment below for extra debugging info
    //curl_setopt($ch, CURLOPT_VERBOSE, true);
    //curl_setopt($ch, CURLOPT_HEADER, true);

    // maximum number of times it will retry to get an existing page
    $curlMaxRetries = 10;
    // maximum number of retries (summed over the whole run) before failing out
    $curlMaxAbort = 20;
    // true/false if running from the commandline
    $commandLine = isset($argv);

    /* 
------------------------------------------------------------------------
        RETRIEVE ALL THE PPV IDS
    
------------------------------------------------------------------------ */

    // Walks the online guide in 2-hour windows for $daysToScrape days and
    // collects the id of every PPV listing found on a wanted channel into
    // $ppvIDArray (id => array('day' => offset from today)).
    //
    // All string and regex literals below are deliberately kept on ONE line:
    // the page has its newlines stripped before matching, so a pattern that
    // contains a line break can never match.

    $currentDay = 0;        // sent as selectedDay (0 = current day)
    $currentHour = 1;       // sent as selectedStartTime (half-hour slot, 1..48)
    $ppvIDArray = array();

    // get initial cookie
    $html = false;
    $retries = 0;
    while (($html === false) && (++$retries <= $curlMaxRetries)) {
        checkRetries();
        printDebugInfo('retreiving initial page for cookie :: attempt ' . $retries . ' / ' . $curlMaxRetries, 1);
        curl_setopt($ch, CURLOPT_URL, 'http://www.bell.ca/ExpressVuEPG/loadVuGuide.do?lang=en');
        $html = curl_exec($ch);
        printDebugInfo($html, 4);
        // Full text shown by the site: "Sorry, due to technical difficulties this
        // function is not available at this time. Please try again later."
        if (($html !== false) && preg_match("/Sorry, due to technical difficulties this function is not available/", $html)) {
            printDebugInfo('guide is down, try again later...', 1);
            //exit();
            $html = false;    // treat the apology page as a failed fetch and retry
        }
    }
    if ($html === false) {
        printDebugInfo('unable to retrieve initial page, aborting...', 1);
        exit();
    }

    // calculate the starting/ending channel
    $startingChannel = 1000;
    $endChannel = 0;
    foreach($channelsToScrape as $junk => $channel) {
        if ($channel < $startingChannel) {
            $startingChannel = $channel;
        }
        if ($channel > $endChannel) {
            $endChannel = $channel;
        }
    }

    while ($currentDay < $daysToScrape) {
        $pageDown = false;    // false = request a new time window, true = page down within it
        do {
            $html = false;
            $retries = 0;
            while (($html === false) && (++$retries <= $curlMaxRetries)) {
                if (!$pageDown) {
                    // Slot n starts at hour floor((n+1)/2), minute 0 or 30; each page
                    // covers 4 slots. Cast to int so mktime() never receives a
                    // fractional float.
                    $windowMinute = (($currentHour - 1) % 2) * 30;
                    $windowStartHour = (int) (($currentHour + 1) / 2);
                    $windowEndHour = (int) (($currentHour + 1 + 4) / 2);
                    printDebugInfo('retrieving list of PPVs :: ' .
                        'day ' . ($currentDay + 1) . ' / ' . $daysToScrape .
                        ', hour ' . date('H:i', mktime($windowStartHour, $windowMinute, 0, 1, 1, 2000)) .
                        ' - ' . date('H:i', mktime($windowEndHour, $windowMinute, 0, 1, 1, 2000)) .
                        ', attempt ' . $retries . ' / ' . $curlMaxRetries, 1);
                } else {
                    printDebugInfo('retrieving next page of channels :: attempt '. $retries . ' / ' . $curlMaxRetries, 2);
                }
                checkRetries();
                if (!$pageDown) {
                    // send POST, follow redirect (automatically) and get page
                    // selectedStartTime :: 1=1am, 2=1:30am, 3=2:00am, ..., 46=11:30pm, 47=12:00am, 48=12:30am
                    // selectedDay :: 0=current day, 1=next day, 2=two days later, ..., 13=...
                    // gotoChannel :: starting channel
                    curl_setopt($ch, CURLOPT_URL, 'http://www.bell.ca/ExpressVuEPG/submitSearchFilter.do');
                    curl_setopt($ch, CURLOPT_REFERER, 'http://www.bell.ca/ExpressVuEPG/loadVuGuide.do?lang=en');
                    curl_setopt($ch, CURLOPT_POST, true);
                    curl_setopt($ch, CURLOPT_POSTFIELDS,
                        'favID=&favName1=&favName2=&favName3=&favName4=&favKeyword1=&favKeyword2=&favKeyword3=&favKeyword4=&favNetwork1=&favNetwork2=&favNetwork3=&favNetwork4=&favTheme1=&favTheme2=&favTheme3=&favTheme4=&selectedTheme=1%2C0%2C0%2C0%2C0%2C0%2C0%2C0&progID=&progTZ=&selectedKeyword=&selectedNetwork=&cbAll=on&orderbyName=&orderbyNo=asc&searchWin=1&userTimeZone=0&selectedFavName=&selectedDay='
                        . $currentDay . '&selectedStartTime=' . $currentHour .
                        '&selectedTimeZone=' . $timezone . '&gotoChannel=' . $startingChannel .
                        '&buttonPressed.x=16&buttonPressed.y=8');
                    $html = curl_exec($ch);
                    printDebugInfo($html, 4);
                } else {
                    // send command to move down a page
                    curl_setopt($ch, CURLOPT_URL, 'http://www.bell.ca/ExpressVuEPG/submitChangeView.do?buttonPressed=DOWN');
                    curl_setopt($ch, CURLOPT_REFERER, 'http://www.bell.ca/ExpressVuEPG/submitSearchFilter.do');
                    curl_setopt($ch, CURLOPT_POST, false);
                    $html = curl_exec($ch);
                    printDebugInfo($html, 4);
                }
                if (($html !== false) && preg_match("/Sorry, due to technical difficulties this function is not available/", $html)) {
                    printDebugInfo('guide is down, trying again', 2);
                    $html = false;
                }
            }
            if ($html === false) {
                // every retry failed: nothing to parse, so stop paging this
                // window and move on to the next one
                printDebugInfo('giving up on this page after ' . $curlMaxRetries . ' attempts', 2);
                break;
            }
            // remove newlines and junk
            $html = preg_replace("/([\r\n]|&nbsp;)/m", "", $html);
            // trim down to the listing rows
            $html = preg_replace("/^.*?start of dynamic rows(.*?)end of dynamic rows.*$/", "$1", $html);
            // tighten: drop whitespace around tag delimiters
            $html = preg_replace("/[ \t]*([<>])[ \t]*/", "$1", $html);
            // now split up this way via </tr> (one table row per part)
            $htmlArray = preg_split("/<\/tr>/", $html);
            foreach($htmlArray as $htmlPart) {
                // the channel number sits in a cell with this exact markup
                if (preg_match("/<td bgcolor=\"#E5F2F8\" width=\"4%\" align=\"center\">(\d+)<\/td>/", $htmlPart, $channelMatch)) {
                    $channel = $channelMatch[1];
                    if ($channel > $endChannel) {
                        // no need to go any further
                        // (early exit left disabled, as in the original script)
                        //continue 2;
                    }
                    printDebugInfo('channel : ' . $channel, 3);
                    if (in_array($channel, $channelsToScrape)) {
                        // if this is a channel we want then rip out ppv links
                        unset($matches);
                        preg_match_all("/javascript:popupwin(?:PPV)?\(\'(\d+)\',/m", $htmlPart, $matches);
                        printDebugInfo('ppv ids from page ($matches)', 3);
                        printDebugInfo($matches, 3);
                        foreach($matches[1] as $match) {
                            // this will automatically force uniqueness and set
                            // the movie to the first day found
                            if (!isset($ppvIDArray[$match])) {
                                // annoyingly I have to set the day manually as
                                // it isn't _anywhere_ on the info page...
                                if ($currentHour >= 47) {
                                    // if it's past midnight it's the next day
                                    $ppvIDArray[$match] = array('day' => ($currentDay + 1));
                                } else {
                                    $ppvIDArray[$match] = array('day' => $currentDay);
                                }
                            }
                        }
                    }
                }
            }
            $pageDown = true;    // start heading down through the pages
        } while (preg_match("/javascript:popupwin(?:PPV)?\(\'(\d+)\',/m", $html));    // while PPVs still listed
        // adjust time
        $currentHour += 4;    // add 2 hours (2 hours listed per page)
        if ($currentHour > 48) {
            // past the maximum 48 so roll to the next day
            $currentHour = $currentHour % 48;
            $currentDay++;
        }
    }
    printDebugInfo('all ppv ids found ($ppvIDArray)', 3);
    printDebugInfo($ppvIDArray, 3);
    printDebugInfo('finished retrieving PPV list, ' . count($ppvIDArray) . ' PPVs found', 1);
   
    /* 
------------------------------------------------------------------------
        RETRIEVE ALL INDIVIDUAL PPV INFORMATION
    
------------------------------------------------------------------------ */
   
    // Fetches the detail page of every collected PPV id and replaces the
    // placeholder entry in $ppvIDArray with the scraped fields. Entries whose
    // page cannot be fetched, or whose channel is not wanted, are dropped.
    //
    // Regex literals are kept on ONE line each: the page has its newlines
    // removed before matching, so a pattern containing a line break never
    // matches (and preg_replace() then hands back the whole page unchanged).
    foreach($ppvIDArray as $ppvID => $ppvInfo) {
        printDebugInfo('retrieving PPV ' . $ppvID, 2);
        curl_setopt($ch, CURLOPT_POST, false);
        curl_setopt($ch, CURLOPT_REFERER, 'http://www.bell.ca/ExpressVuEPG/submitSearchFilter.do');
        curl_setopt($ch, CURLOPT_URL, 'http://www.bell.ca/ExpressVuEPG/vuDetails.do?code=' . $ppvID . '&tzcode=' . $timezone);
        $html = false;
        $retries = 0;
        while (($html === false) && (++$retries <= $curlMaxRetries)) {
            checkRetries();
            $html = curl_exec($ch);
            printDebugInfo($html, 4);
        }
        if ($html === false) {
            // no page after all retries: there is nothing to insert for this id
            printDebugInfo('unable to retrieve PPV ' . $ppvID . ', skipping', 2);
            unset($ppvIDArray[$ppvID]);
            continue;
        }
        // remove newlines and junk
        $html = preg_replace("/([\r\n]|&nbsp;)/m", "", $html);
        // trim to the part of the body that precedes the PIN prompt
        $html = preg_replace("/^.*?<Body(.*?)Your Bell ExpressVu PIN.*$/", "$1", $html);
        // tighten: drop whitespace around tag delimiters
        $html = preg_replace("/[ \t]*([<>])[ \t]*/", "$1", $html);

        // rip out info
        $title = preg_replace("/^.*?<td valign=\"top\" align=\"left\"><div class=\"bigblueBoldText\">(.*?)<\/div>.*$/m", "$1", $html);
        $channel = preg_replace("/^.*?>Channel: .*? - (\d+).*$/m", "$1", $html);
        $startTime = preg_replace("/^.*?>Start Time: (\d?\d:\d\d .M) .*$/m", "$1", $html);
        $endTime = preg_replace("/^.*?>End Time: (\d?\d:\d\d .M) .*$/m", "$1", $html);

        // optional fields: only extracted when their label is on the page
        $description = $rating = $cost = '';
        if (preg_match("/>Description of the show:</", $html)) {
            $description = preg_replace("/^.*?>Description of the show:<\/div><div class=\"blueText\">(.*?)<\/div>.*$/m", "$1", $html);
            // remove useless date from description
            $description = preg_replace("/^\(\d{2}:\d{2}[ap]m[^\)]*?\)(.*)$/m", "$1", $description);
        }
        if (preg_match("/>Rating:</", $html)) {
            $rating = preg_replace("/^.*?>Rating:<\/div><div class=\"blueText\">([^<]*?)<\/div>.*$/m", "$1", $html);
            // clean up junk
            $rating = preg_replace("/([ ]+,)/", "", $rating);
        }
        if (preg_match("/>[\$](\d+\.\d\d)</", $html)) {
            $cost = preg_replace("/^.*?<div class=\"blueText\">[\$](\d+\.\d\d)<\/div>.*$/m", "$1", $html);
        }

        if (in_array($channel, $channelsToScrape)) {
            // if we want this channel then insert into array
            $ppvIDArray[$ppvID] = array (
                'day' => $ppvInfo['day'],
                'title' => html_entity_decode($title),
                'channel' => $channel,
                'starttime' => $startTime,
                'endtime' => $endTime,
                'description' => html_entity_decode($description),
                'rating' => html_entity_decode($rating),
                'cost' => $cost    );
        } else {
            unset($ppvIDArray[$ppvID]);
        }
        printDebugInfo("$title :: $channel :: $startTime :: $endTime :: $description :: $rating :: $cost", 3);
    }
    printDebugInfo('all info to be inserted into myth ($ppvIDArray)', 3);
    printDebugInfo($ppvIDArray, 3);
   
    /* 
------------------------------------------------------------------------
        SAVE PPV INFO TO MYTHTV DATABASE
    
------------------------------------------------------------------------ */
   
    // Writes the scraped listings into the MythTV tables: looks up the chanid
    // of every configured channel, deletes the existing guide rows of those
    // channels, then inserts one program / programgenres / programrating row
    // per scraped PPV.
    printDebugInfo('inserting ppv info into myth', 1);

    // get chanid for each individual channel
    printDebugInfo('retrieving channel info from myth db', 3);
    $channelsInMyth = array();    // channum => chanid
    foreach($channelsToScrape as $junk => $channel) {
        $sql = 'SELECT chanid, channum FROM channel WHERE sourceid = ? AND channum = ?';
        if ($stmt->prepare($sql)) {
            $stmt->bind_param('ii', $sourceID, $channel);
            $stmt->execute();
            $stmt->store_result();    // buffer everything
        }
        if ($stmt->errno) {
            die($stmt->error);
        } else {
            $stmt->bind_result($chanid, $channum);
            // Only trust the bound variables when fetch() actually returned a
            // row; otherwise they may still hold the previous channel's values.
            if ($stmt->fetch() && $channum) {
                $channelsInMyth[$channum] = $chanid;
            }
        }
        $stmt->free_result();
    }

    // delete all channel lineups
    printDebugInfo('deleting all previous info from ppv channels', 3);
    $sqls = array(
        'DELETE FROM program WHERE chanid = ?',
        'DELETE FROM programgenres WHERE chanid = ?',
        'DELETE FROM programrating WHERE chanid = ?');
    foreach($channelsInMyth as $channum => $chanid) {
        foreach ($sqls as $sql) {
            if ($stmt->prepare($sql)) {
                $stmt->bind_param('i', $chanid);
                $stmt->execute();
            }
            if ($stmt->errno) {
                die($stmt->error);
            }
            $stmt->free_result();
        }
    }

    printDebugInfo('inserting individual ppv info', 3);
    // insert into myth
    foreach($ppvIDArray as $ppvID => $ppvInfo) {
        if (!isset($channelsInMyth[$ppvInfo['channel']])) {
            // channel is not configured in myth for this source: nothing to attach to
            continue;
        }
        $chanid = $channelsInMyth[$ppvInfo['channel']];

        // The guide only gives clock times; the date comes from the day offset
        // recorded while the listing pages were scanned.
        $currentDate = date('Y-m-d', mktime(0, 0, 0, date("m"), date("d")+$ppvInfo['day'], date("Y")));
        $startStamp = strtotime($currentDate . ' ' . $ppvInfo['starttime']);
        $endStamp = strtotime($currentDate . ' ' . $ppvInfo['endtime']);
        if (($startStamp === false) || ($endStamp === false)) {
            // unparsable times would otherwise be stored as 1970 dates
            printDebugInfo('skipping PPV ' . $ppvID . ', could not parse its start/end time', 1);
            continue;
        }
        if ($startStamp > $endStamp) {
            // the endtime lands on the next day
            $currentDate = date('Y-m-d', mktime(0, 0, 0, date("m"), date("d")+$ppvInfo['day']+1, date("Y")));
            $endStamp = strtotime($currentDate . ' ' . $ppvInfo['endtime']);
        }
        $starttime = date('Y-m-d G:i:s', $startStamp);
        $endtime = date('Y-m-d G:i:s', $endStamp);

        $genre = 'PPV';        // acceptable?
        $closecaptioned = 0;
        $stars = 0;
        $stereo = 1;
        $title = $ppvInfo['title'];
        // rating text and price have no column of their own here, so they are
        // appended to the description
        $description = $ppvInfo['description'];
        if ($ppvInfo['rating']) {
            $description .= "\r\n" . 'Rating: ' . $ppvInfo['rating'];
        }
        if ($ppvInfo['cost']) {
            $description .= "\r\n" . 'Cost: $' . $ppvInfo['cost'];
        }
        $channum = $ppvInfo['channel'];
        // try and detect some ratings
        if (preg_match("/\(G\)/", $ppvInfo['rating'])) {
            $rating = 'G';
        } elseif (preg_match("/\(PG\)/", $ppvInfo['rating'])) {
            $rating = 'PG';
        } elseif (preg_match("/\(R\)/", $ppvInfo['rating'])) {
            $rating = 'R';
        } else {
            $rating = 'NR';
        }

        // insert into sql; a failed insert is logged and the run carries on
        $sql = 'INSERT INTO program (starttime, endtime, chanid, category, closecaptioned, stars, stereo, title, description) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)';
        if ($stmt->prepare($sql)) {
            $stmt->bind_param('ssisidiss', $starttime, $endtime, $chanid, $genre, $closecaptioned, $stars, $stereo, $title, $description);
            $stmt->execute();
        }
        if ($stmt->errno) {
            //die($stmt->error);
            printDebugInfo('mysql : ' . $stmt->error, 1);
        }
        $stmt->free_result();

        $sql = 'INSERT INTO programgenres (starttime, chanid, genre) VALUES (?, ?, ?)';
        if ($stmt->prepare($sql)) {
            $stmt->bind_param('sis', $starttime, $chanid, $genre);
            $stmt->execute();
        }
        if ($stmt->errno) {
            //die($stmt->error);
            printDebugInfo('mysql : ' . $stmt->error, 1);
        }
        $stmt->free_result();

        $sql = 'INSERT INTO programrating (starttime, chanid, rating) VALUES (?, ?, ?)';
        if ($stmt->prepare($sql)) {
            $stmt->bind_param('sis', $starttime, $chanid, $rating);
            $stmt->execute();
        }
        if ($stmt->errno) {
            //die($stmt->error);
            printDebugInfo('mysql : ' . $stmt->error, 1);
        }
        $stmt->free_result();
    }

    printDebugInfo('done...', 1);
    curl_close($ch);
    exit();
    // done
   
    /* 
------------------------------------------------------------------------
        FUNCTIONS
    
------------------------------------------------------------------------ */

    /**
     * Output a debug message to either the console or a web browser.
     *
     * Nothing is printed when $textDebugLevel is higher than the global
     * $debugLevel. On the command line (global $commandLine is true) the text
     * is printed followed by a newline. Otherwise it is HTML-escaped, and
     * level 3 / level 4 messages are wrapped in a blue / red bordered box.
     *
     * @param string|array $text           message, or an array to dump with print_r()
     * @param int          $textDebugLevel verbosity level this message belongs to
     */
    function printDebugInfo($text, $textDebugLevel) {
        global $debugLevel, $commandLine;

        if ($textDebugLevel > $debugLevel) {
            return;
        }

        if ($commandLine) {
            // running from command line
            if (is_array($text)) {
                // print_r() prints by itself; the trailing newline needs its
                // own echo (concatenating it to print_r()'s return value
                // would just throw it away)
                print_r($text);
                echo "\n";
            } else {
                echo $text . "\n";
            }
            return;
        }

        // running from a browser: levels 3 and 4 get a coloured box
        $borderColours = array(3 => '#0000FF', 4 => '#FF0000');
        $boxed = isset($borderColours[$textDebugLevel]);
        if ($boxed) {
            echo '<div style="margin: 2px 0; padding-left:10px; background-color:#EEEEEE; border:1px solid '
                . $borderColours[$textDebugLevel] . ';">';
        }
        if (is_array($text)) {
            echo '<pre>';
            print_r($text);
            echo '</pre><br />';
        } else {
            echo nl2br(htmlentities($text)) . '<br />';
        }
        if ($boxed) {
            // close the box only if one was opened, so the markup stays balanced
            echo '</div>';
        }
        flush();    // send to browser immediately
    }
   
    /**
     * Account for a retried page request and abort the script once the
     * overall retry budget is used up.
     *
     * Reads the global $retries (attempt number of the current request) and,
     * for every attempt after the first, consumes one unit of the global
     * $curlMaxAbort budget. When the budget is exhausted the script exits.
     */
    function checkRetries() {
        global $retries, $curlMaxAbort;

        if ($retries <= 1) {
            // first attempt of a request: nothing has failed yet
            return;
        }
        // "<=" rather than "==" so a budget that is already negative
        // (e.g. a misconfigured $curlMaxAbort) still aborts
        if ($curlMaxAbort-- <= 0) {
            printDebugInfo('too many pages failed, aborting...', 1);
            exit();
        }
        // a page failed so wait 3 seconds
        // NOTE(review): the sleep itself is disabled below, so the retry fires
        // immediately although the message says otherwise - confirm whether
        // the delay should be restored.
        printDebugInfo('page retrieval failed, waiting 3 seconds before retrying', 3);
        //sleep(3);
    }
   
?>



More information about the mythtv-users mailing list