
Zunächst das Herzstück, die Webbot-Funktion

Code: Alles auswählen
<?php
/**
 * Fetches a URL with cURL and returns the response body.
 *
 * Redirects are followed, a fixed MSIE 6 user agent is sent, and cookies are
 * read from and written to "cookies.txt" (relative to the current working
 * directory), so consecutive calls share one cookie store.
 *
 * @param string $url      Address to request.
 * @param mixed  $method   Falsy for GET, truthy for POST.
 * @param mixed  $fields   POST payload handed to CURLOPT_POSTFIELDS; only used when $method is truthy.
 * @param mixed  $lighttpd Truthy to send an empty "Expect:" header, which removes
 *                         cURL's automatic "Expect: 100-continue" on POST requests.
 * @param mixed  $proxy    Proxy such as '127.0.0.1:8001', or falsy for a direct connection.
 *
 * @return string|false Response body, or false if the handle could not be
 *                      created or the transfer failed.
 */
function webbot( $url, $method=0, $fields=0, $lighttpd=0, $proxy=0 )
{
    $socket = curl_init();
    if ($socket === false) {
        // Report failure the same way a failed curl_exec() would.
        return false;
    }

    curl_setopt($socket, CURLOPT_URL, $url);
    // The option is a boolean switch; the former value 2 is not a documented
    // "on" value and newer libcurl versions give it a different meaning.
    curl_setopt($socket, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($socket, CURLOPT_RETURNTRANSFER, true);

    if ($lighttpd) {
        curl_setopt($socket, CURLOPT_HTTPHEADER, array("Expect:"));
    }

    if ($method) {
        curl_setopt($socket, CURLOPT_POST, true);
        curl_setopt($socket, CURLOPT_POSTFIELDS, $fields);
    } else {
        curl_setopt($socket, CURLOPT_POST, false);
    }

    if ($proxy) {
        curl_setopt($socket, CURLOPT_PROXY, $proxy);
    }

    // NOTE(review): certificate verification is switched off, so HTTPS peers
    // are not authenticated. Kept for backward compatibility; enable it
    // whenever the target hosts present valid certificates.
    curl_setopt($socket, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($socket, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)");
    curl_setopt($socket, CURLOPT_COOKIEJAR, "cookies.txt");
    curl_setopt($socket, CURLOPT_COOKIEFILE, "cookies.txt");

    $page = curl_exec($socket);
    curl_close($socket);

    return $page;
}
?>
1. URL
2. Die Methode -> 0 = GET, 1 = POST
3. Felder für ein Formular -> Array
4. lighttpd-Support -> eine 1, wenn es sich beim Zielserver um lighttpd handelt
5. Proxy -> einfach einen Proxy eintragen, z. B. '127.0.0.1:8001'
Um nun eine Seite zu scrapen, schreiben wir folgendes:
Code: Alles auswählen
<?php
// Fetch the page with a plain GET request (all optional arguments left at
// their defaults) and print whatever webbot() returned.
$content = webbot('http://www.google.de');
print $content;
?>
Code: Alles auswählen
<?php
/**
 * Fetches a URL with cURL and returns the response body.
 *
 * Redirects are followed, a fixed MSIE 6 user agent is sent, and cookies are
 * read from and written to "cookies.txt" (relative to the current working
 * directory), so consecutive calls share one cookie store.
 *
 * @param string $url      Address to request.
 * @param mixed  $method   Falsy for GET, truthy for POST.
 * @param mixed  $fields   POST payload handed to CURLOPT_POSTFIELDS; only used when $method is truthy.
 * @param mixed  $lighttpd Truthy to send an empty "Expect:" header, which removes
 *                         cURL's automatic "Expect: 100-continue" on POST requests.
 * @param mixed  $proxy    Proxy such as '127.0.0.1:8001', or falsy for a direct connection.
 *
 * @return string|false Response body, or false if the handle could not be
 *                      created or the transfer failed.
 */
function webbot( $url, $method=0, $fields=0, $lighttpd=0, $proxy=0 )
{
    $socket = curl_init();
    if ($socket === false) {
        // Report failure the same way a failed curl_exec() would.
        return false;
    }

    curl_setopt($socket, CURLOPT_URL, $url);
    // The option is a boolean switch; the former value 2 is not a documented
    // "on" value and newer libcurl versions give it a different meaning.
    curl_setopt($socket, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($socket, CURLOPT_RETURNTRANSFER, true);

    if ($lighttpd) {
        curl_setopt($socket, CURLOPT_HTTPHEADER, array("Expect:"));
    }

    if ($method) {
        curl_setopt($socket, CURLOPT_POST, true);
        curl_setopt($socket, CURLOPT_POSTFIELDS, $fields);
    } else {
        curl_setopt($socket, CURLOPT_POST, false);
    }

    if ($proxy) {
        curl_setopt($socket, CURLOPT_PROXY, $proxy);
    }

    // NOTE(review): certificate verification is switched off, so HTTPS peers
    // are not authenticated. Kept for backward compatibility; enable it
    // whenever the target hosts present valid certificates.
    curl_setopt($socket, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($socket, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)");
    curl_setopt($socket, CURLOPT_COOKIEJAR, "cookies.txt");
    curl_setopt($socket, CURLOPT_COOKIEFILE, "cookies.txt");

    $page = curl_exec($socket);
    curl_close($socket);

    return $page;
}
/**
 * Logs in to yigg.de and returns the page delivered after the login POST.
 *
 * The login form is fetched first to read its hidden CSRF token, which is
 * then posted back together with the credentials.
 *
 * @param string $name Account user name.
 * @param string $pwd  Account password.
 *
 * @return string|false Response body of the login request, or false when the
 *                      form could not be fetched or contains no CSRF token.
 */
function login( $name, $pwd )
{
    $url = 'http://www.yigg.de/login';

    $data = webbot($url);
    if (!is_string($data)) {
        // The form request failed, so there is nothing to extract a token from.
        return false;
    }

    // [^"]* stops at the closing quote of the value attribute; the previous
    // greedy (.*) could run on to a later id="login__csrf_token" on the same line.
    $pattern = '/name="login\[_csrf_token\]" value="([^"]*)" id="login__csrf_token"/';
    if (!preg_match($pattern, $data, $found)) {
        // Without a token the POST would be rejected anyway; fail explicitly
        // instead of reading an undefined $found[1].
        return false;
    }

    $fields = array(
        'login[_csrf_token]' => $found[1],
        'login[username]'    => $name,
        'login[password]'    => $pwd,
        'login[remember]'    => 'on',
        'commit'             => 'Anmelden',
    );

    // POST the form with the lighttpd flag set, as the original call did.
    return webbot($url, 1, $fields, 1);
}
// Run the login with placeholder credentials and print whatever login() returned.
$content = login('username', 'password');
print $content;
?>