curl: can't fetch rss from website because of CloudFlare
Asked Answered
C

3

6

I'm not able to connect to this site, http://www.youm7.com/newtkarirrss.asp, using cURL on the server.

But I can access it from localhost without any problem.

Here is the test

http://www.tjreb.com/xml_grabber.php?feed=http://www.youm7.com/newtkarirrss.asp&stack=1

Try The CNN rss feed

http://www.tjreb.com/xml_grabber.php?feed=http://rss.cnn.com/rss/edition_meast.rss&stack=0

How can I bypass this error?

Here is my source code

<?php
  /**
   * Loads an RSS 2.0 document from a local file, an http(s) URL or a raw XML
   * string and converts it into a summary array (or JSON / serialized / XML).
   *
   * Failures are never thrown; they are collected in an error map that can be
   * read with get_error().
   */
  class xml_grabber
  {
      /** @var string Local file path (source type 1). */
      private $xml_file = '';

      /** @var string Remote feed URL (source type 2). */
      private $xml_link = '';

      /** @var string Raw XML markup (source type 3). */
      private $xml_dom = '';

      /** @var int|string 1 = file, 2 = URL, 3 = raw XML string, '' = unknown. */
      private $xml_type = '';

      /** @var SimpleXMLElement|string|false Parsed document, '' before load(), false after a failed parse. */
      private $xml_content = '';

      /** @var array Error messages keyed by the stage that produced them. */
      private $xml_errors = array();

      /** @var array Value stored by set_columns(); declared so no dynamic property is created. */
      private $xml_columns = array();

      /** @var int When 1, load() echoes the raw HTTP response and terminates the script (debug aid). */
      public $xml_stack = 0;

      /**
       * @param string $link_file_com File path, URL or raw XML string to read the feed from.
       */
      public function __construct($link_file_com = '')
      {
          if (!$link_file_com) {
              $this->xml_errors['construct'] = 'No Xml In Construct';
              return;
          }

          if (!function_exists('simplexml_load_file')
              || !function_exists('simplexml_load_string')
              || !function_exists('simplexml_import_dom')
          ) {
              $this->xml_errors['functions'] = 'simple xml function not exists';
              return;
          }

          $this->set_xml($link_file_com);
      }

      /**
       * Classifies the source as an existing file (1), a valid URL (2) or raw XML (3).
       *
       * Sources shorter than four characters (or non-strings) leave the type unknown.
       *
       * @param string $xml File path, URL or raw XML string.
       */
      public function set_xml($xml)
      {
          $this->xml_type = '';

          // $xml[3] replaces the $xml{3} offset syntax that PHP 8 removed.
          if (!is_string($xml) || !isset($xml[3])) {
              return;
          }

          if (file_exists($xml)) {
              $this->xml_type = 1;
              $this->xml_file = $xml;
          } elseif (filter_var($xml, FILTER_VALIDATE_URL)) {
              $this->xml_type = 2;
              $this->xml_link = $xml;
          } else {
              $this->xml_type = 3;
              $this->xml_dom = $xml;
          }
      }

      /**
       * @return string|false The configured source (path, URL or XML string), or false when unknown.
       */
      public function get_xml()
      {
          if ($this->xml_type === 1) {
              return $this->xml_file;
          }
          if ($this->xml_type === 2) {
              return $this->xml_link;
          }
          if ($this->xml_type === 3) {
              return $this->xml_dom;
          }

          return false;
      }

      /**
       * Stores a column list and returns it.
       *
       * @param array $new_columns
       * @return array
       */
      public function set_columns($new_columns = array())
      {
          return $this->xml_columns = $new_columns;
      }

      /**
       * @return array The value last passed to set_columns() (empty array by default).
       */
      public function get_columns()
      {
          return $this->xml_columns;
      }

      /**
       * Reads and parses the configured source so that fetch() can be called.
       *
       * @return bool|string For file/URL sources: true when the XML was parsed, false otherwise.
       *                     For a raw XML source: the raw XML string (kept for backward
       *                     compatibility); the string is now parsed as well.
       */
      public function load()
      {
          if ($this->xml_type === 1) {
              $this->xml_content = $this->parse_xml($this->xml_file, true);
              return $this->xml_content !== false;
          }

          if ($this->xml_type === 2) {
              $body = $this->connect($this->xml_link);

              // Debug mode: dump whatever the server answered and stop.
              if ($this->xml_stack == 1) {
                  echo $body;
                  die();
              }

              if (!is_string($body) || $body === '') {
                  // connect() has already recorded why the download failed.
                  $this->xml_content = false;
                  return false;
              }

              $this->xml_content = $this->parse_xml($body, false);
              return $this->xml_content !== false;
          }

          if ($this->xml_type === 3) {
              // Previously the string was returned without being parsed, so
              // fetch() could never produce a result for raw XML input.
              $this->xml_content = $this->parse_xml($this->xml_dom, false);
              return $this->xml_dom;
          }

          $this->xml_errors['loader'] = 'Unknown XML type';
          return false;
      }

      /**
       * Builds the feed summary from the document parsed by load().
       *
       * @param string $return 'array' (default), 'json', 'serialize' or 'xml'.
       * @return array|string|false The summary in the requested format, or false
       *                            when no parsed document is available.
       */
      public function fetch($return = 'array')
      {
          // A SimpleXMLElement must not be compared with '': the comparison uses
          // the root element's own text, which is empty for a compact feed.
          if (!($this->xml_content instanceof SimpleXMLElement)) {
              $this->xml_errors['fetch'] = 'No Xml Content';
              return false;
          }

          $channel = $this->xml_content->channel;

          $rss_summary = array(
              'info' => array(
                  'title' => (string) $channel->title,
                  'link'  => (string) $channel->link,
                  'cat'   => (string) $channel->category,
                  'image' => isset($channel->image->url) ? (string) $channel->image->url : '',
              ),
              'item' => array(),
          );

          $items = isset($channel->item) ? $channel->item : array();

          foreach ($items as $item) {
              // Reset per item so an item without <enclosure> does not inherit
              // the URL of the previous one.
              $enclosure_url = '';
              if (isset($item->enclosure)) {
                  $enclosure_url = (string) $item->enclosure['url'];
              }

              $rss_summary['item'][] = array(
                  'title'       => (string) $item->title,
                  'description' => (string) $item->description,
                  'link'        => (string) $item->link,
                  'date'        => (string) $item->pubDate,
                  'image'       => (string) $item->image,
                  'image2'      => $enclosure_url,
              );
          }

          if ($return === 'json') {
              return json_encode($rss_summary);
          }
          if ($return === 'serialize') {
              return serialize($rss_summary);
          }
          if ($return === 'xml') {
              // xml_encode() is not a PHP function; use it only if the project
              // defines one, otherwise fall back to the built-in converter.
              return function_exists('xml_encode')
                  ? xml_encode($rss_summary)
                  : $this->array_to_xml($rss_summary);
          }

          return $rss_summary;
      }

      /**
       * Downloads a URL and returns the response body.
       *
       * cURL is tried first; if it is unavailable or returns nothing, PHP's URL
       * stream wrappers are used when allow_url_fopen is enabled.
       *
       * @param string $link Absolute http:// or https:// URL.
       * @return string|false Response body, or false on failure (see get_error()).
       */
      protected function connect($link)
      {
          // FILTER_VALIDATE_URL also accepts file://, ftp://, ... which would let
          // a caller-supplied "feed" read local files; only allow web schemes.
          $scheme = is_string($link) ? strtolower((string) parse_url($link, PHP_URL_SCHEME)) : '';
          if (!is_string($link)
              || !filter_var($link, FILTER_VALIDATE_URL)
              || !in_array($scheme, array('http', 'https'), true)
          ) {
              $this->xml_errors['connect'] = 'Not Vaild Link To Get data';
              return false;
          }

          $has_curl  = function_exists('curl_init');
          $has_fopen = filter_var(ini_get('allow_url_fopen'), FILTER_VALIDATE_BOOLEAN);

          if (!$has_curl && !$has_fopen) {
              // allow_url_fopen cannot be switched on with ini_set() at runtime,
              // so there is nothing further to try.
              $this->xml_errors['connect'] = 'Curl And Allow Url Fopen Disabled On Server';
              return false;
          }

          if ($has_curl) {
              $cu = curl_init();
              curl_setopt_array($cu, array(
                  CURLOPT_URL            => $link,
                  CURLOPT_RETURNTRANSFER => true,
                  CURLOPT_PROTOCOLS      => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                  // Without timeouts an unresponsive host blocks the request forever.
                  CURLOPT_CONNECTTIMEOUT => 10,
                  CURLOPT_TIMEOUT        => 30,
                  // Certificate checks stay enabled; disabling them allows
                  // anyone on the network path to tamper with the feed.
                  CURLOPT_SSL_VERIFYPEER => true,
                  CURLOPT_SSL_VERIFYHOST => 2,
              ));

              $response = curl_exec($cu);
              $failed   = !is_string($response) || $response === '';
              if ($failed) {
                  $this->xml_errors['connect'] = 'No Result From Curl';
                  $this->xml_errors['curl']    = curl_error($cu);
              }
              curl_close($cu);

              if (!$failed) {
                  return $response;
              }
          }

          if ($has_fopen) {
              // "@" keeps transport warnings out of the output; the failure is
              // reported through the error map instead.
              if (function_exists('file_get_contents')) {
                  $response = @file_get_contents($link);
                  if (is_string($response) && $response !== '') {
                      return $response;
                  }
                  $this->xml_errors['connect'] = 'No Result From file_get_contents';
              } elseif (function_exists('file')) {
                  // file() yields an array of lines; join them back into one body.
                  // (readfile() is not usable here: it prints the body and
                  // returns only a byte count.)
                  $lines = @file($link);
                  if (is_array($lines) && count($lines) > 0) {
                      return implode('', $lines);
                  }
                  $this->xml_errors['connect'] = 'No Result From file';
              }
          }

          return false;
      }

      /**
       * @return array Error messages collected so far, keyed by stage
       *               ('construct', 'functions', 'loader', 'connect', 'curl', 'fetch').
       */
      public function get_error()
      {
          return $this->xml_errors;
      }

      /**
       * Parses XML from a file path or a string without emitting libxml warnings.
       *
       * @param string $source  File path or XML markup.
       * @param bool   $is_file True when $source is a path.
       * @return SimpleXMLElement|false Parsed document, or false (error stored under 'loader').
       */
      private function parse_xml($source, $is_file)
      {
          $previous = libxml_use_internal_errors(true);

          $doc = $is_file
              ? simplexml_load_file($source, 'SimpleXMLElement', LIBXML_NOCDATA)
              : simplexml_load_string($source, 'SimpleXMLElement', LIBXML_NOCDATA);

          if ($doc === false) {
              $last = libxml_get_last_error();
              $this->xml_errors['loader'] = $last
                  ? 'Invalid XML: ' . trim($last->message)
                  : 'Invalid XML';
          }

          libxml_clear_errors();
          libxml_use_internal_errors($previous);

          return $doc;
      }

      /**
       * Serializes the summary array as an XML document rooted at <rss_summary>.
       *
       * @param array $data
       * @return string
       */
      private function array_to_xml(array $data)
      {
          $root = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?><rss_summary/>');
          $this->append_xml($root, $data);

          return $root->asXML();
      }

      /**
       * Recursively appends array entries to $node; numeric keys become <entry>.
       *
       * @param SimpleXMLElement $node
       * @param array            $data
       */
      private function append_xml(SimpleXMLElement $node, array $data)
      {
          foreach ($data as $key => $value) {
              $name = is_int($key) ? 'entry' : (string) $key;

              if (is_array($value)) {
                  $this->append_xml($node->addChild($name), $value);
                  continue;
              }

              // addChild() does not escape "&", so escape the text up front.
              $node->addChild(
                  $name,
                  htmlspecialchars((string) $value, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8')
              );
          }
      }
  }


// Entry point: read ?feed=, ?fetch= and ?stack= from the query string, load the
// feed and print either the summary or the collected errors.

// The feed content comes from a third party; serve it as plain text so markup
// inside the feed is never interpreted by the browser.
if (!headers_sent()) {
    header('Content-Type: text/plain; charset=utf-8');
}

// Default feed when none is requested.
$url = 'http://rss.cnn.com/rss/edition_meast.rss';

if (isset($_GET['feed'])) {
    // addslashes() is not URL validation (and mangles URLs containing quotes).
    // Accept only absolute http(s) URLs so a request cannot point the grabber
    // at a local file path; anything else yields an empty source and an error.
    $candidate = is_string($_GET['feed']) ? trim($_GET['feed']) : '';
    $scheme    = strtolower((string) parse_url($candidate, PHP_URL_SCHEME));
    $is_web    = filter_var($candidate, FILTER_VALIDATE_URL)
        && in_array($scheme, array('http', 'https'), true);

    $url = $is_web ? $candidate : '';
}

// Optional parameters; reading them unguarded raised "undefined index" notices.
$fetch = (isset($_GET['fetch']) && is_string($_GET['fetch'])) ? $_GET['fetch'] : 'array';
$stack = isset($_GET['stack']) ? intval($_GET['stack']) : null;

$xml = new xml_grabber($url);

/*
 http://www.youm7.com/new3agelrss.asp
 http://www.youm7.com/newtkarirrss.asp
 http://www.almasryalyoum.com/rss_feed_term/223241/rss.xml
 http://gdata.youtube.com/feeds/api/playlists/18A7E36C33EF4B5D?v=2
 http://rss.cnn.com/rss/edition_meast.rss
 https://www.facebook.com/feeds/page.php?format=atom10&id=40796308305
 https://www.facebook.com/feeds/page.php?format=rss20&id=40796308305
 http://www.fwasl.com/feed
 https://www.facebook.com/feeds/page.php?format=atom10&id=378156838895039
 */

// stack=1 makes load() dump the raw HTTP response and stop (debugging aid).
if ($stack !== null) {
    $xml->xml_stack = $stack;
}

$xml->load();

$result = $xml->fetch($fetch);

if ($result) {
    print_r($result);
} else {
    print_r($xml->get_error());
}

?>
Caaba answered 9/8, 2012 at 15:26 Comment(1)
my project mujaz.awcore.comYmir
H
5

You can't easily bypass Cloudflare. However you can hack the protection system. :)

First, parse the page (Cloudflare protection page) and calculate 3+13*7 (most probably this will be different for each request.) in

// Challenge script as it appears on the protection page: once the DOM is ready
// it waits, fills in the computed answer and submits the challenge form.
$(function(){setTimeout(
            function(){
                // The arithmetic expression is the value to extract and evaluate (3+13*7 = 94 here).
                $('#jschl_answer').val(3+13*7);
                $('#ChallengeForm').submit();
            },
            // Delay in milliseconds before the form is submitted.
            5850
)});

Then send a POST request to the same page with the "jschl_vc" value taken from #ChallengeForm in the parsed page and with "jschl_answer" set to the result of 3+13*7. After that, fetch the page again, sending the cookie that Cloudflare set in its response. Once you have been added to Cloudflare's whitelist, you won't see the challenge page anymore.

Histrionics answered 10/8, 2012 at 3:58 Comment(6)
Thank you @burak emre for answer i liked your answer , but i did it with out result i sent request to the server and got claudflare page so i got the captcha value from jquery val and sent it with the other inputs in the form with curl post method ,Caaba
i think the server should save cookie , but i don't know how curl return cookies or work with itCaaba
@MonaAbdelmajeed take a look at here: #896286Histrionics
I Tried Your Idea to re post data to curl and get cookie No Result from cookie , here is the url of test tjreb.com/xml_grabber.php?feed=http://www.youm7.com/… I just add some information in the page , here is the header parse that i used https://mcmap.net/q/158537/-how-to-get-the-cookies-from-a-php-curl-into-a-variable ... Waiting Your AnswerCaaba
Actually it returns a cookie value. For example the page that you shared must be requested with "__cfduid=dae00b9d3a19db1891fb83e3f7fd5d15d1345008603;". (Take a look at source code of the page) Could you please check again?Histrionics
It wasn't work , untill i contact this website server admin , but your idea very useful :) , Thank you :)Caaba
R
4

You can pass cloudflare protection with PhantomJS http://phantomjs.org/ which can execute the cloudflare JS outside a browser with following little script "delay.js":

"use strict";
// delay.js — open a URL in PhantomJS, wait a fixed time so that scripts on the
// page can run, then print the resulting page content to stdout.
//
// Usage: phantomjs delay.js URL delay
//   URL    address to open
//   delay  time to wait after the page has loaded, in milliseconds
var page = require('webpage').create(),
    system = require('system'),
    address, delay;

if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: delay.js URL delay');
    phantom.exit(1);
} else {
    address = system.args[1];
    // Command-line arguments are strings; convert explicitly so that a typo
    // does not silently turn into a zero-length wait.
    delay = parseInt(system.args[2], 10);

    if (isNaN(delay) || delay < 0) {
        console.log('delay must be a non-negative number of milliseconds');
        phantom.exit(1);
    } else {
        page.open(address, function (status) {
            if (status !== 'success') {
                console.log('Unable to load the address!');
                phantom.exit(1);
            } else {
                // Give the page's own scripts time to finish before dumping it.
                window.setTimeout(function () {
                    console.log(page.content);
                    phantom.exit();
                }, delay);
            }
        });
    }
}

run it as phantomjs delay.js http://protected.url 5000

This fetches "protected.url", waits 5000 ms for the Cloudflare code to load the real page, and then dumps the page to stdout.

Ritch answered 7/12, 2016 at 12:11 Comment(0)
R
-3

You need to tell their site what browser you're using.

curl_setopt ($cu, CURLOPT_USERAGENT, $user_agent);

e.g. Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4) Gecko/20030624 Netscape/7.1 (ax)

or use the current users own browser agent using $_SERVER['HTTP_USER_AGENT']

Renn answered 9/8, 2012 at 15:51 Comment(2)
whistling quietly to himself pondering the meaning of life and stuff ooh! look a tumbleweed!Renn
Haven't used Cloudflare so sorry this didn't help. Glad @burak-emre was able to solve this +1Renn

© 2022 - 2024 — McMap. All rights reserved.