Country: '.$country.''; // echo 'search by number or search by keyword
// Default range of IMDb title numbers, used when the request does not supply one.
define('START_NUMBER', '1');
define('STOP_NUMBER', '10');

// The bounds arrive from the query string, so they are validated as positive
// integers before they drive the loop; anything else falls back to the default.
$starts = imdb_range_bound(isset($_GET['starts']) ? $_GET['starts'] : null, START_NUMBER);
$stops = imdb_range_bound(isset($_GET['stops']) ? $_GET['stops'] : null, STOP_NUMBER);

// NOTE(review): the echo statements below presumably carried the markup of the
// start/stop range form; only the label text survives in this copy of the file.
// They are kept byte-for-byte -- confirm against the original template.
echo '';
echo '';
echo '';
echo '';
echo '';
echo '
enter start numberenter stop number
';

for ($search = $starts; $search <= $stops; $search++) {
    $imdb_id = 'tt' . sprintf('%07d', $search);
    $url = 'http://www.imdb.com/title/' . $imdb_id . '/';

    // Get the page content; get_data() yields '' when the request fails.
    $imdb_content = get_data($url);

    // Parse the individual fields out of the page.
    // The title pattern is anchored on the opening <title> tag: without it the
    // lazy group starts at the very beginning of the document.
    $name = get_match('/<title>(.*)<\/title>/isU', $imdb_content);
    $director = strip_tags(get_match('/<h5[^>]*>Director:<\/h5>(.*)<\/div>/isU', $imdb_content));
    $plot = get_match('/<h5[^>]*>Plot:<\/h5>(.*)<\/div>/isU', $imdb_content);

    $genre_original = get_match('/<h5[^>]*>Genre:<\/h5>(.*)<\/div>/isU', $imdb_content);
    $genre_processed_once = str_replace('<a href="/Sections/Genres/', '<a target="_blank" href="http://www.imdb.com/Sections/Genres/', $genre_original);
    $genre = str_replace('href="/title/tt', 'target="new" href="http://www.imdb.com/title/tt', $genre_processed_once);

    $keywords_original = get_match('/<h5[^>]*>Plot Keywords:<\/h5>(.*)<\/div>/isU', $imdb_content);
    $keywords = str_replace('<a href="/keyword/', '<a target="_blank" href="http://www.imdb.com/keyword/', $keywords_original);

    $tagline_original = get_match('/<h5[^>]*>Tagline:<\/h5>(.*)<\/div>/isU', $imdb_content);
    $tagline = str_replace('href="/title/tt', 'target="new" href="http://www.imdb.com/title/tt', $tagline_original);

    $awards_original = get_match('/<h5[^>]*>Awards:<\/h5>(.*)<\/div>/isU', $imdb_content);
    $awards = str_replace('href="/title/tt', 'target="new" href="http://www.imdb.com/title/tt', $awards_original);

    $release_date = get_match('/<h5[^>]*>Release Date:<\/h5>(.*)<\/div>/isU', $imdb_content);
    $mpaa = get_match('/<a href="\/mpaa">MPAA<\/a>:<\/h5>(.*)<\/div>/isU', $imdb_content);
    $run_time = get_match('/Runtime:<\/h5>(.*)<\/div>/isU', $imdb_content);
    $user_rating = get_match('/<div class="starbar-meta">\n <b>(.*)<\/b>/isU', $imdb_content);
    $poster = get_match('/<a name="poster".* src="(.*)".*<\/a>/isU', $imdb_content);

    // Plot and release date are followed by a trailing link on the page; keep
    // only the text in front of it (or the whole text when there is no link).
    $plot_text = imdb_text_before_link($plot);
    $release_text = imdb_text_before_link($release_date);

    // Build the HTML summary for this title. It is reset on every pass so that
    // re-enabling the echo below prints each title exactly once.
    $content = '';
    $content .= '<h2>Film</h2><p>' . $name . '</p>' . "\n";
    if ($poster != '') {
        $content .= '<h2>Poster</h2><p><img src="' . $poster . '" border="0" alt="' . $name . '"><br/><a href="' . $poster . '" target="_blank">link to poster</a><br/>(not authorized)</p>' . "\n";
    } else {
        $content .= '<h2>Poster</h2><p>No Poster available</p>' . "\n";
    }
    $content .= '<h2>Director</h2><p>' . $director . '</p>' . "\n";
    $content .= '<h2>Plot</h2><p>' . $plot_text . '</p>' . "\n";
    $content .= '<h2>Genre</h2><p>' . $genre . '</p>' . "\n";
    $content .= '<h2>Keywords</h2><p>' . $keywords . '</p>' . "\n";
    $content .= '<h2>Tagline</h2><p>' . $tagline . '</p>' . "\n";
    $content .= '<h2>Awards</h2><p>' . $awards . '</p>' . "\n";
    $content .= '<h2>Release Date</h2><p>' . $release_text . '</p>' . "\n";
    $content .= '<h2>MPAA</h2><p>' . $mpaa . '</p>' . "\n";
    $content .= '<h2>Run Time</h2><p>' . $run_time . '</p>' . "\n";
    $content .= '<h2>User Rating</h2><p>' . $user_rating . '</p>' . "\n";
    $content .= '<h2>Full Details</h2><p><a href="' . $url . '" rel="nofollow" target="_blank">' . $url . '</a></p>' . "\n";
    // echo $content;

    // Build one pipe-delimited row: id|name|director|plot|release date|run time|
    // Every row gets the same six columns and the same "\n" terminator, also
    // when a field (including the run time) is missing.
    // NOTE(review): the plot column used to go through str_replace('"','"',...),
    // which replaces a double quote with itself; presumably an entity decode
    // was intended -- confirm before adding one.
    $csv_row = implode('|', array(
        $imdb_id,
        imdb_csv_field($name),
        imdb_csv_field($director),
        imdb_csv_field($plot_text),
        imdb_csv_field($release_text),
        imdb_csv_field($run_time),
    )) . "|\n";

    // Append the row to the output file.
    if (file_put_contents($csv_file_to_write, $csv_row, FILE_APPEND) === false) {
        die("can't open file");
    }
}

/**
 * Turns a raw request value into a positive integer loop bound.
 *
 * @param mixed  $raw     Value taken from the query string, or null when absent.
 * @param string $default Fallback used when $raw is not an integer >= 1.
 * @return int
 */
function imdb_range_bound($raw, $default)
{
    $value = filter_var($raw, FILTER_VALIDATE_INT, array('options' => array('min_range' => 1)));

    return ($value === false) ? (int) $default : $value;
}

/**
 * Returns the part of an HTML fragment that precedes its first "<a".
 *
 * When the fragment contains no link the whole fragment is returned; feeding
 * strpos()'s false straight into substr() would yield an empty string instead.
 *
 * @param string $html
 * @return string
 */
function imdb_text_before_link($html)
{
    $html = (string) $html;
    $link_at = strpos($html, '<a');

    return ($link_at === false) ? $html : substr($html, 0, $link_at);
}

/**
 * Prepares one scraped value for a column of the pipe-delimited file.
 *
 * Removes links and the known wrapper markup, replaces the column separator
 * with "-", drops line breaks and trims. An empty result becomes a single
 * space so the column is still present.
 *
 * @param string $value
 * @return string
 */
function imdb_csv_field($value)
{
    $value = (string) $value;

    // Lazy, per-field match: a greedy pattern run over the whole row can
    // swallow everything between the first and the last link, separators included.
    $without_links = preg_replace('#<a\b.*?</a>#is', '', $value);
    if ($without_links !== null) {
        $value = $without_links;
    }

    $value = str_replace(
        array(
            '<div id="tn15plotkeywords" style="display:inline;"><span>',
            '<div class="info-content">',
            '</span>',
            ' »',
        ),
        '',
        $value
    );
    $value = trim(str_replace(array("\r", "\n"), '', str_replace('|', '-', $value)));

    return ($value === '') ? ' ' : $value;
}

/**
 * Gets the first capture group of $regex in $content.
 *
 * @param string $regex   PCRE pattern containing at least one capture group.
 * @param string $content Text to search.
 * @return string The captured text, or '' when the pattern does not match.
 */
function get_match($regex, $content)
{
    if (preg_match($regex, (string) $content, $matches) === 1 && isset($matches[1])) {
        return $matches[1];
    }

    return '';
}

/**
 * Gets the data from a URL using cURL.
 *
 * @param string $url
 * @return string The response body, or '' when the request fails.
 */
function get_data($url)
{
    $ch = curl_init();
    $timeout = 5;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    $data = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; callers expect a string to parse.
    return is_string($data) ? $data : '';
}
?>
</body>
</html>