#! /usr/bin/php
<?php
/**
 * Crawl the La Poste postal code search form and store every (zipcode, city)
 * pair found in the result table into the `zipcode` MySQL table.
 *
 * The form token (ezxform_token), the session cookies and the last processed
 * code are kept in memcached so that an interrupted run can be resumed.
 */

//Memcache key prefix
//NOTE(review): MEMPFX is used throughout this script but its definition is not
//visible in the reviewed source; the value below is an assumption - confirm
if (!defined('MEMPFX')) {
    define('MEMPFX', 'zipcode_');
}

//First code to query when neither the database nor memcache provide a restart point
//NOTE(review): the original initial value of $first is not visible in the
//reviewed source; 0 is an assumed lower bound - confirm
$first = 0;

//Last code to query (inclusive)
//NOTE(review): the loop comment says "all possible postal code" but the bound
//found in the source is 10000; kept unchanged - confirm whether 99999 was intended
$lastCode = 10000;

//Memcache configuration
$memcfg = [
    'id' => 'zipcode',
    'address' => 'localhost',
    'port' => 11211
];

//Mysql configuration
$mycfg = [
    'host' => 'localhost',
    'username' => 'root',
    'passwd' => '',
    'dbname' => 'zipcode'
];

//Ctx configuration
$headers = [
    'Connection: close',
    'User-agent: Laposte/0.1'
];

//Search url
$url = 'https://www.laposte.fr/particulier/outils/trouver-un-code-postal';

//Tidy config
$config = [
    //mostly useless in fact
    'indent' => true,
    //Required to simplify simplexml transition
    'output-xml' => true,
    //Required to avoid xml errors
    'quote-nbsp' => false,
    //Required to fix code
    'clean' => true
];

/**
 * Repair an html page with tidy and load the result in simplexml.
 *
 * Tidy errors are deliberately ignored: tidy is only there to fix the markup.
 *
 * @param tidy $tidy The tidy object used for the repair
 * @param string $data The raw html page
 * @param array $config The tidy configuration
 * @return SimpleXMLElement The parsed document
 */
function zipcode_parse_html($tidy, $data, array $config)
{
    $tidy->parseString($data, $config, 'utf8');

    //The tidy object is cast to string by the SimpleXMLElement constructor
    return new SimpleXMLElement($tidy);
}

/**
 * Extract the VilleCP[ezxform_token] value from a parsed page.
 *
 * @param SimpleXMLElement $sx The parsed document
 * @return string|null The value attribute of the token input, null when the input is absent
 */
function zipcode_extract_token(SimpleXMLElement $sx)
{
    $inputs = $sx->xpath('//input[@id="VilleCP_ezxform_token"]');

    //xpath returns an empty array (or false) when nothing matches
    if (empty($inputs)) {
        return null;
    }

    return (string)$inputs[0]->attributes()->value;
}

/**
 * Apply the Set-Cookie headers of a response to the cookie jar.
 *
 * A cookie whose expires attribute evaluates to a timestamp below 100000
 * (the epoch-dated reset detected by the original code) is removed from the
 * jar; any other cookie is added, replacing a previous value for that name.
 *
 * @param array $cookies The current cookie jar (name => value)
 * @param array $responseHeaders The raw response header lines
 * @return array The updated cookie jar
 */
function zipcode_update_cookies(array $cookies, array $responseHeaders)
{
    foreach ($responseHeaders as $header) {
        //Header names are case-insensitive
        if (!preg_match('/^Set-Cookie:\s*([^;]+)/i', $header, $matches)) {
            continue;
        }

        //Split name=value pair
        parse_str($matches[1], $pair);

        //Handle cookie reset
        if (preg_match('/;\s*expires=([^;]+)/i', $header, $expires) && strtotime($expires[1]) < 100000) {
            $cookies = array_diff_key($cookies, $pair);
        //Add or refresh cookie
        //XXX: array_replace lets the newest value win, unlike the + operator
        } else {
            $cookies = array_replace($cookies, $pair);
        }
    }

    return $cookies;
}

/**
 * Build the value of the Cookie request header from the cookie jar.
 *
 * @param array $cookies The cookie jar (name => value)
 * @return string The urlencoded pairs joined with "; "
 */
function zipcode_cookie_header(array $cookies)
{
    $pairs = [];
    foreach ($cookies as $key => $value) {
        $pairs[] = urlencode($key).'='.urlencode($value);
    }

    return implode('; ', $pairs);
}

//Create mysqli object
$mysqli = new mysqli($mycfg['host'], $mycfg['username'], $mycfg['passwd'], $mycfg['dbname']);

//Abort on connection failure instead of failing later on prepare
if ($mysqli->connect_error) {
    die('Couldn\'t connect to mysql: '.$mysqli->connect_error);
}

//Set to restart on last zipcode in database
//XXX: use to restart from last zipcode in case of memcache reset since last run
if ($result = $mysqli->query('SELECT MAX(zipcode) AS zipcode FROM zipcode')) {
    if (($row = $result->fetch_row()) && !empty($row[0])) {
        $first = $row[0] + 1;
    }
    $result->free();
}

//Configure memcached
$mem = new Memcached($memcfg['id']);
$mem->setOption(Memcached::OPT_LIBKETAMA_COMPATIBLE, true);
$mem->addServer($memcfg['address'], $memcfg['port']);

//Tidy object
$tidy = new tidy();

//Find out if we have ezxform_token in cache
if (($ezxform_token = $mem->get(MEMPFX.'ezxform_token')) === false || ($cookies = $mem->get(MEMPFX.'cookies')) === false) {
    //Set fake context
    $ctx = stream_context_create(
        [
            'http' => [
                'method' => 'GET',
                'max_redirects' => 0,
                'ignore_errors' => 0,
                'header' => $headers
            ]
        ]
    );

    //Load the page for fetching VilleCP[ezxform_token]
    //XXX: keep file_get_contents in this scope, $http_response_header is created in the calling scope
    if (($data = file_get_contents($url, false, $ctx)) === false) {
        die($_SERVER['PHP_SELF'].': file_get_contents(ezxform_token) failed: '.$url);
    }

    //Load simplexml
    $sx = zipcode_parse_html($tidy, $data, $config);

    //Extract ezxform_token
    if (($ezxform_token = zipcode_extract_token($sx)) === null) {
        die($_SERVER['PHP_SELF'].': ezxform_token not found: '.$url);
    }

    //Set ezxform_token in memcache
    $mem->set(MEMPFX.'ezxform_token', $ezxform_token, time()+60);

    //Cleanup
    unset($sx);

    //Handle cookies
    $cookies = zipcode_update_cookies([], $http_response_header);

    //Set cookies in memcache
    $mem->set(MEMPFX.'cookies', $cookies, time()+60);
}

//Find out if we have last code in memcache
if (($last = $mem->get(MEMPFX.'last')) === false) {
    //Set last in memcache
    $mem->set(MEMPFX.'last', ($last = $first), time()+3600*24);
}

//Create insert stmt
if (!($stmt = $mysqli->prepare('INSERT IGNORE INTO zipcode (zipcode, city) VALUES (?, ?)'))) {
    die('Couldn\'t prepare insert');
}

//Bind params once
//XXX: mysqli binds by reference, assigning the variables before each execute is enough
$zip = $city = '';
if (!$stmt->bind_param('ss', $zip, $city)) {
    die('Couldn\'t bind params');
}

//Loop on all possible postal code
//TODO: see if we need to validate in extra all insee code with letters
for ($i = $last; $i <= $lastCode; $i++) {
    //Build data
    $pdata = http_build_query(
        [
            'VilleCP[filtre]' => 'ville',
            'VilleCP[communeCode]' => sprintf('%05d', $i),
            'VilleCP[save]' => '',
            'VilleCP[ezxform_token]' => $ezxform_token
        ]
    );
    echo $pdata.': ';

    //Build request headers
    //XXX: array_merge is required here, the + operator drops the entries sharing a numeric key with $headers
    $pheaders = array_merge(
        $headers,
        [
            'Content-type: application/x-www-form-urlencoded',
            'Content-Length: '.strlen($pdata)
        ]
    );

    //Build cookie
    if (!empty($cookies)) {
        $pheaders[] = 'Cookie: '.zipcode_cookie_header($cookies);
    }

    //Set post context
    $pctx = stream_context_create(
        [
            'http' => [
                'method' => 'POST',
                'max_redirects' => 0,
                'ignore_errors' => 0,
                'header' => $pheaders,
                'content' => $pdata
            ]
        ]
    );

    //Load the result page
    if (($data = file_get_contents($url, false, $pctx)) === false) {
        die($_SERVER['PHP_SELF'].': file_get_contents('.$pdata.') failed: '.$url);
    }

    //Load simplexml
    $sx = zipcode_parse_html($tidy, $data, $config);

    //Extract the token required by the next request
    if (($ezxform_token = zipcode_extract_token($sx)) === null) {
        die($_SERVER['PHP_SELF'].': ezxform_token not found: '.$url);
    }

    //Store ezxform_token in memcache
    $mem->set(MEMPFX.'ezxform_token', $ezxform_token, time()+30);

    //Handle cookies
    $cookies = zipcode_update_cookies($cookies, $http_response_header);

    //Set cookies in memcache
    $mem->set(MEMPFX.'cookies', $cookies, time()+60);

    //Extract cities
    foreach ($sx->xpath('/html/body/div[@id="app"][1]/div[9]/div[3]/div/div/div/div/table/tbody/tr') as $line) {
        $zip = trim((string)$line->td[0]);
        $city = trim((string)$line->td[1]);

        //Skip rows without zipcode cell
        if ($zip === '') {
            continue;
        }

        if (!$stmt->execute()) {
            die('Couldn\'t execute');
        }
    }

    echo 'OK'."\n";

    //Set last in memcache
    $mem->set(MEMPFX.'last', $i, time()+3600*24);
}

//Close insert request
$stmt->close();

//Close mysqli connection
$mysqli->close();