Raphaël G. Git Repositories - zipcode/blob - laposte
Import laposte script with first set of data
[zipcode] / laposte
#! /usr/bin/php
<?php

/**
 * Import postal codes from the La Poste "trouver un code postal" form into
 * the `zipcode` MySQL table.
 *
 * The script fetches the search page once to obtain the form token
 * (VilleCP[ezxform_token]) and the session cookies, then posts one search per
 * numeric code, inserting every (zipcode, city) row found in the result table.
 * The token, the cookies and the last processed code are kept in memcached so
 * an interrupted run can resume where it stopped.
 */

//Force timezone
date_default_timezone_set('UTC');

//Memcache prefix
define('MEMPFX', 'zipcode_');

//First code to query when neither memcache nor the database knows better
$first = 0;

//Last code to query (inclusive)
//NOTE(review): real postal codes go well beyond this value; presumably limited on purpose for a first data set - confirm
$final = 10000;

//Memcached configuration
$memcfg = [
    'id' => 'zipcode',
    'address' => 'localhost',
    'port' => 11211,
];

//Mysql configuration
$mycfg = [
    'host' => 'localhost',
    'username' => 'root',
    'passwd' => '',
    'dbname' => 'zipcode',
];

//Headers sent with every request
$headers = [
    'Connection: close',
    'User-agent: Laposte/0.1',
];

//Search url
$url = 'https://www.laposte.fr/particulier/outils/trouver-un-code-postal';

//Tidy config
$config = [
    //mostly useless in fact
    'indent' => true,
    //Required to simplify simplexml transition
    'output-xml' => true,
    //Required to avoid xml errors
    'quote-nbsp' => false,
    //Required to fix code
    'clean' => true,
];

/**
 * Print a message on stderr and stop the script with a non-zero status.
 *
 * @param string $message Text describing the failure
 */
function laposte_fail($message)
{
    fwrite(STDERR, $message.PHP_EOL);
    exit(1);
}

/**
 * Download a page and load it as XML after a tidy pass.
 *
 * @param string   $url     Page to request
 * @param resource $context Stream context describing the request
 * @param tidy     $tidy    Tidy instance used to turn the HTML into XML
 * @param array    $config  Tidy configuration
 *
 * @return array|null array('document' => SimpleXMLElement, 'headers' => string[]),
 *                    or null when the request itself failed
 *
 * @throws Exception when the tidied output still can't be parsed as XML
 */
function laposte_fetch($url, $context, tidy $tidy, array $config)
{
    if (($data = file_get_contents($url, false, $context)) === false) {
        return null;
    }

    //XXX: the tidy error buffer is ignored on purpose, tidy is here to fix the markup
    $tidy->parseString($data, $config, 'utf8');

    return [
        'document' => new SimpleXMLElement((string) $tidy),
        //$http_response_header is created by file_get_contents in the local scope
        'headers' => isset($http_response_header) ? $http_response_header : [],
    ];
}

/**
 * Read the form token from a parsed search page.
 *
 * @param SimpleXMLElement $document Parsed page
 *
 * @return string|null Token value, or null when the input is not in the page
 */
function laposte_extract_token(SimpleXMLElement $document)
{
    $inputs = $document->xpath('//input[@id="VilleCP_ezxform_token"]');

    if (empty($inputs) || !isset($inputs[0]['value'])) {
        return null;
    }

    return (string) $inputs[0]['value'];
}

/**
 * Apply the Set-Cookie headers of a response to a cookie jar.
 *
 * A cookie whose expires attribute is missing from the calendar (unparsable)
 * or lies at the very beginning of the epoch is treated as a reset and removed;
 * any other cookie replaces the stored value of the same name.
 *
 * @param array    $cookies         Current cookies, name => value
 * @param string[] $responseHeaders Raw response header lines
 *
 * @return array Updated cookies
 */
function laposte_update_cookies(array $cookies, array $responseHeaders)
{
    foreach ($responseHeaders as $header) {
        //Header names are case-insensitive and the space after the colon is optional
        if (!preg_match('/^Set-Cookie:\s*([^;]+)(.*)$/i', $header, $matches)) {
            continue;
        }

        parse_str($matches[1], $pair);

        //Handle cookie reset
        if (preg_match('/;\s*expires=([^;]+)/i', $matches[2], $expires)) {
            $timestamp = strtotime($expires[1]);
            if ($timestamp === false || $timestamp < 100000) {
                foreach (array_keys($pair) as $name) {
                    unset($cookies[$name]);
                }
                continue;
            }
        }

        //Add cookie, the newest value wins
        $cookies = array_replace($cookies, $pair);
    }

    return $cookies;
}

/**
 * Build the value of a Cookie request header.
 *
 * @param array $cookies Cookies, name => value
 *
 * @return string "name=value; name=value" list, empty when there is no cookie
 */
function laposte_cookie_header(array $cookies)
{
    $pairs = [];
    foreach ($cookies as $name => $value) {
        //parse_str may produce nested arrays for names with brackets, they can't be sent back
        if (!is_scalar($value)) {
            continue;
        }
        $pairs[] = urlencode((string) $name).'='.urlencode((string) $value);
    }

    return implode('; ', $pairs);
}

//Create mysqli object
$mysqli = new mysqli($mycfg['host'], $mycfg['username'], $mycfg['passwd'], $mycfg['dbname']);
if ($mysqli->connect_errno) {
    laposte_fail('Couldn\'t connect to mysql: '.$mysqli->connect_error);
}

//Set to restart on last zipcode in database
//XXX: use to restart from last zipcode in case of memcache reset since last run
if ($result = $mysqli->query('SELECT MAX(zipcode) AS zipcode FROM zipcode')) {
    if (($row = $result->fetch_row()) && !empty($row[0])) {
        $first = (int) $row[0] + 1;
    }
    $result->free();
}

//Configure memcached
$mem = new Memcached($memcfg['id']);
$mem->setOption(Memcached::OPT_LIBKETAMA_COMPATIBLE, true);
$mem->addServer($memcfg['address'], $memcfg['port']);

//Tidy object
$tidy = new tidy();

//Find out if we have ezxform_token and cookies in cache
$ezxform_token = $mem->get(MEMPFX.'ezxform_token');
$cookies = $mem->get(MEMPFX.'cookies');

if ($ezxform_token === false || !is_array($cookies)) {
    //Set get context
    $ctx = stream_context_create(
        [
            'http' => [
                'method' => 'GET',
                'max_redirects' => 0,
                'ignore_errors' => 0,
                'header' => $headers,
            ],
        ]
    );

    //Load the page for fetching VilleCP[ezxform_token]
    if (($page = laposte_fetch($url, $ctx, $tidy, $config)) === null) {
        laposte_fail($_SERVER['PHP_SELF'].': file_get_contents(ezxform_token) failed: '.$url);
    }

    //Extract ezxform_token
    if (($ezxform_token = laposte_extract_token($page['document'])) === null) {
        laposte_fail($_SERVER['PHP_SELF'].': ezxform_token not found: '.$url);
    }

    //Set ezxform_token in memcache
    $mem->set(MEMPFX.'ezxform_token', $ezxform_token, time()+60);

    //Handle cookies
    $cookies = laposte_update_cookies([], $page['headers']);

    //Set cookies in memcache
    $mem->set(MEMPFX.'cookies', $cookies, time()+60);

    //Cleanup
    unset($page);
}

//Find out if we have last code in memcache
if (($last = $mem->get(MEMPFX.'last')) === false) {
    //Set last in memcache
    $mem->set(MEMPFX.'last', ($last = $first), time()+3600*24);
}

//Create insert stmt
if (!($stmt = $mysqli->prepare('INSERT IGNORE INTO zipcode (zipcode, city) VALUES (?, ?)'))) {
    laposte_fail('Couldn\'t prepare insert');
}

//Bind once, params are bound by reference so each execute reads the current values
$zipcode = '';
$city = '';
if (!$stmt->bind_param('ss', $zipcode, $city)) {
    laposte_fail('Couldn\'t bind params');
}

//Loop on all possible postal code
//TODO: see if we need to validate in extra all insee code with letters
for ($i = $last; $i <= $final; $i++) {
    //Build data
    $pdata = http_build_query(
        [
            'VilleCP[filtre]' => 'ville',
            'VilleCP[communeCode]' => sprintf('%05d', $i),
            'VilleCP[save]' => '',
            'VilleCP[ezxform_token]' => $ezxform_token,
        ]
    );

    echo $pdata.': ';

    //Build request headers
    //XXX: array_merge is required here, the + operator drops entries of a list sharing an index with $headers
    $pheaders = array_merge(
        $headers,
        [
            'Content-type: application/x-www-form-urlencoded',
            //Content-Length is a byte count, strlen is the right function
            'Content-Length: '.strlen($pdata),
        ]
    );

    //Build cookie
    if (($cookie = laposte_cookie_header($cookies)) !== '') {
        $pheaders[] = 'Cookie: '.$cookie;
    }

    //Set post context
    $pctx = stream_context_create(
        [
            'http' => [
                'method' => 'POST',
                'max_redirects' => 0,
                'ignore_errors' => 0,
                'header' => $pheaders,
                'content' => $pdata,
            ],
        ]
    );

    //Load the result page for this code
    if (($page = laposte_fetch($url, $pctx, $tidy, $config)) === null) {
        laposte_fail($_SERVER['PHP_SELF'].': file_get_contents('.$pdata.') failed: '.$url);
    }

    //Extract the token to use for the next request
    if (($ezxform_token = laposte_extract_token($page['document'])) === null) {
        laposte_fail($_SERVER['PHP_SELF'].': ezxform_token not found in response to '.$pdata.': '.$url);
    }

    //Store ezxform_token in memcache
    $mem->set(MEMPFX.'ezxform_token', $ezxform_token, time()+30);

    //Handle cookies
    $cookies = laposte_update_cookies($cookies, $page['headers']);

    //Set cookies in memcache
    $mem->set(MEMPFX.'cookies', $cookies, time()+60);

    //Extract cities
    //XXX: positional path, tied to the current page layout
    $lines = $page['document']->xpath('/html/body/div[@id="app"][1]/div[9]/div[3]/div/div/div/div/table/tbody/tr') ?: [];
    foreach ($lines as $line) {
        $zipcode = trim((string) $line->td[0]);
        $city = trim((string) $line->td[1]);
        if (!$stmt->execute()) {
            laposte_fail('Couldn\'t execute');
        }
    }

    echo 'OK'."\n";

    //Set last in memcache
    $mem->set(MEMPFX.'last', $i, time()+3600*24);
}

//Close insert request
$stmt->close();

//Close mysqli connection
$mysqli->close();