1 #include <string.h>
  2 #include <stdio.h>
  3 #include <limits.h>
  4 #include "rh_string.h"
  5 #include "uri.h"
  6 
  7 #define MAP_WANT_XDIGIT                 MAP_XDIGIT
  8 #define MAP_WANT_HTTP_HOST              MAP_HOST
  9 #define MAP_WANT_HTTP_URI               MAP_URI
 10 #define MAP_WANT_HTTP_URI_DECODED       MAP_URI_DECODED
 11 #include "rh_charmap.h"
 12 
 13 /*
 14  *
 15  * how the uri decoder works:
 16  *
 17  *
 18  * 1. test the whole raw uri against illegal characters
 19  *
 20  * 2. hex decode uri (if in 1. a % sign was found)
 21  * 
 22  * 3. validate decoded and non query characters (%00 may be used in ?query)
 23  * 
 24  * 4. split uri in its parts (user, pass, host, query, etc.)
 25  *
 26  * 5. validate/lowercase host:port, if any given
 27  *
 28  */
 29 
/*
 * table of the uri schemes known to the parser.
 *
 * uri_get_scheme() returns &uri_scheme[URI_SCHEME_HTTP], so every entry
 * has to stay at the array index equal to its URI_SCHEME_* value.
 * NOTE(review): this assumes URI_SCHEME_UNSET..URI_SCHEME_FTP are 0..3,
 * confirm against uri.h.
 *
 * the table is terminated by an entry with scheme == -1 and name == NULL.
 */
static const
struct uri_scheme {
        int             scheme;         /* URI_SCHEME_* identifier */
        const char      *name;          /* scheme prefix including "://" */
        unsigned short  length;         /* length of name; uri_decode() skips
                                           this many bytes of the uri
                                           (presumably filled in by
                                           CONST_STR_LEN, TODO confirm) */
        unsigned short  port;           /* port number belonging to the scheme */
} uri_scheme[] = {
        { URI_SCHEME_UNSET, CONST_STR_LEN(""),           0 },
        { URI_SCHEME_HTTP , CONST_STR_LEN("http://"),   80 },
        { URI_SCHEME_HTTPS, CONST_STR_LEN("https://"), 443 },
        { URI_SCHEME_FTP  , CONST_STR_LEN("ftp://"),    21 },
        { -1, NULL, }                   /* end of table marker */
};
 43 
/* hex decode ("%XY") src into buffer; returns 0 on success, -1 on error */
static int uri_decode_hex (rh_buffer_t *buffer, const char *src, size_t nsrc);

/* remove "//", "/./" and "/../" from uri->path; returns -1 if the path
 * would leave the root */
static int uri_decode_strange_path (uri_t *uri, const char *src, size_t nsrc);

/* split a hex decoded uri into its components; returns 0 on success,
 * -1 on error */
static int uri_decode (uri_t *uri, const char *src, size_t nsrc);
 49 
 50 
 51 void uri_init (uri_t *uri)
 52 {
 53         memset(uri, 0, sizeof(*uri));
 54 }
 55 
 56 void uri_destroy (uri_t *uri)
 57 {
 58         rh_buffer_destroy (&uri->raw);
 59         rh_buffer_destroy (&uri->log);
 60         rh_buffer_destroy (&uri->decoded);
 61         rh_buffer_destroy (&uri->user);
 62         rh_buffer_destroy (&uri->pass);
 63         rh_buffer_destroy (&uri->path);
 64         rh_buffer_destroy (&uri->query);
 65         rh_buffer_destroy (&uri->fragment);
 66         rh_buffer_destroy (&uri->filename);
 67 
 68         http_host_destroy (&uri->host);
 69 }
 70 
 71 void uri_reset (uri_t *uri)
 72 {
 73         uri_destroy (uri);
 74         uri_init    (uri);
 75 }
 76 
 77 int uri_parse (uri_t *uri, const char *src, size_t nsrc)
 78 {
 79         size_t  offset;
 80         int     decode_hex;
 81         
 82         if (rh_buffer_append (&uri->raw, src, nsrc))
 83                 goto error_out;
 84 
 85         decode_hex = 0;
 86 
 87         /* illegal character test, without hex decoding */
 88         for (offset=0; offset<nsrc; ++offset) {
 89                 switch ( MAP_URI[(unsigned char)src[offset]] ) {
 90                 case 0:
 91                         goto error_out;
 92                 case '%':
 93                         decode_hex = 1;
 94                 }
 95         }
 96 
 97         if (0 == decode_hex) {
 98                 rh_buffer_set_const(&uri->decoded,uri->raw.data,uri->raw.used);
 99         } else {
100                 /* 
101                  * decode hex
102                  * validate decoded non query characters against MAP_URI_DECODED
103                  * 
104                  * (" " is invalid in the uri, but its valid as hex %20)
105                  *
106                  */
107                 if (0 != uri_decode_hex (&uri->decoded,
108                                         uri->raw.data, uri->raw.used))
109                 {
110                         goto error_out;
111                 }
112         }
113 
114         /* 
115          * seperate uri parts
116          * validate/lowercase host:port 
117          *
118          */
119         if (0 != uri_decode (uri, uri->decoded.data, uri->decoded.used)) {
120                 goto error_out;
121         }
122 
123         /* decoding was successfully, its safe to use the raw uri
124          * for logging since all characters were valid */
125         rh_buffer_set_const(&uri->log, uri->raw.data, uri->raw.used);
126 
127 #if 0
128         printf ("uri_decode: [%.*s]\n",
129                         uri->decoded.used,
130                         uri->decoded.data );
131 
132         uri_dump (uri);
133 #endif
134 
135         return 0;
136 
137 error_out:
138         
139         /* 
140          * XXX
141          *
142          * build uri->log for logging by
143          * encoding invalid characters with \NN (NN==hex)
144          * 
145          */
146 
147         uri->log.used = uri->log.size = 0;
148 
149         return -1;
150 }
151 
152 
153 void uri_dump (uri_t *uri)
154 {
155         printf ("%s(%p)\n", __FUNCTION__, (void*)uri);
156 
157         printf ("  filename : [%3u:%.*s]\n",
158                         uri->filename.used,
159                         uri->filename.used,
160                         uri->filename.data);
161         
162         printf ("  user     : [%3u:%.*s]\n",
163                         uri->user.used,
164                         uri->user.used,
165                         uri->user.data);
166         
167         printf ("  pass     : [%3u:%.*s]\n",
168                         uri->pass.used,
169                         uri->pass.used,
170                         uri->pass.data);
171         
172         printf ("  host     : [%3u:%.*s]\n",
173                         uri->host.host.used,
174                         uri->host.host.used,
175                         uri->host.host.data);
176         
177         printf ("  port     : [   :%d]\n",
178                         uri->host.port);
179         
180         printf ("  path     : [%3u:%.*s]\n",
181                         uri->path.used,
182                         uri->path.used,
183                         uri->path.data);
184         
185         printf ("  query    : [%3u:%.*s]\n",
186                         uri->query.used,
187                         uri->query.used,
188                         uri->query.data);
189         
190         printf ("  fragment : [%3u:%.*s]\n",
191                         uri->fragment.used,
192                         uri->fragment.used,
193                         uri->fragment.data);
194 }
195 
196 /*
197  * get the scheme of a proxy uri
198  *
199  */
200 static const struct uri_scheme * uri_get_scheme (const char *src, size_t nsrc)
201 {
202         /* 01234567
203          * 
204          * https://
205          * http://
206          * ftp://
207          *
208          *     s
209          *     :
210          *     /
211          *   
212          *     ^-------> uniq characters, only one may differ in case
213          */
214 
215         if (nsrc < CONST_LEN("ftp://"))
216                 return NULL;
217 
218         switch ( ((unsigned char*)src)[4] ) {
219         case ':':
220                 /* http  */
221 
222 
223                 /* 
224                  * 1234567      increment
225                  * 0123456      offset
226                  * 
227                  * http://
228                  *    1234
229                  *
230                  */
231                 
232                 if (!RH_EQUAL2("//", src+5))
233                         return NULL;
234                 
235                 /* fast try */
236                 if (!RH_EQUAL4("http", src)) {
237                         if (src[0] != 'h' && src[0] != 'H') return NULL;
238                         if (src[1] != 't' && src[1] != 'T') return NULL;
239                         if (src[2] != 't' && src[2] != 'T') return NULL;
240                         if (src[3] != 'p' && src[3] != 'P') return NULL;
241                 }
242                 
243                 return &uri_scheme[URI_SCHEME_HTTP];
244         
245         case 's':
246         case 'S':
247                 /* https */
248                 return NULL; /* atm not supported */
249                 break;
250         
251         
252         case '/':
253                 /* ftp   */
254                 return NULL; /* atm not supported */
255                 break;
256         }
257 
258         return NULL;
259 }
260 
261         
262 /*
263  *
264  * parses a uri into its components
265  * 
266  * parses "strange" path's (eg: "/dir/../")
267  *
268  * ---- WARNING --------------------------
269  * >
270  * > src has to be hex decoded.
271  * >
272  * ---------------------------------------
273  * 
274  * uri is of the form:
275  * 
276  *   [schema://[user[:pass]@]host[:port]]/uri[?query][#fragment]
277  *
278  */
279 static int uri_decode (uri_t *uri, const char *src, size_t nsrc)
280 {
281         int             state;          /* current state */
282         int             state_mask;     /* current state mask */
283         unsigned char   last_ch;        /* last character */
284         int             strange_path;   /* true if the uri has a strange path
285                                            (eg. "/../")
286                                            which requires revalidation. */
287         rh_buffer_t     *dest;          /* into which buffer should written */
288         rh_buffer_t     host = {0, };   /* temp. used host/port buffer */
289         rh_buffer_t     port = {0, };
290 
291         
292 
293         /* uri has to be at least one character ("/") long */
294         if (!nsrc)
295                 return -1;
296 
297         /* macro to simplify to set the current destination buffer */
298 #define DEST(_dest)                                     \
299         {       dest = (_dest);                         \
300                 dest->flags = RH_BUFFER_CONST;          \
301                 dest->data  = (char*)src;               \
302                 dest->used  = 0;                        \
303         }
304 
305         /* macro to simplify inc/dec-rementing src */
306 #define IGN_CHAR()      { ++src; --nsrc; }
307         
308         /* macro to simplify "using the current character for dest" */
309 #define USE_CHAR()      { ++dest->used;  }
310 
311         if ('/' == src[0]) {
312                 state_mask = state = URI_DECODE_STATE_PATH;
313                 DEST(&uri->path);
314         } else {
315                 const struct uri_scheme * scheme;
316 
317                 scheme = uri_get_scheme (src, nsrc);
318                 if (NULL == scheme)
319                         return -1;
320 
321                 src  += scheme->length;
322                 nsrc -= scheme->length;
323                 
324                 state_mask = state = URI_DECODE_STATE_USER;
325                 DEST(&uri->user);
326         }
327 #if 0
328         printf ("%s(): uri(%2d:%.*s)\n", __FUNCTION__, nsrc, nsrc , src );
329 #endif
330 
331         last_ch = 0;
332         strange_path = 0;
333 
334         while (nsrc) {
335                 unsigned char ch;
336                 
337                 ch = src[0];
338 
339                 /* 
340                  * test for strange path stuff ("/../" and friends)
341                  * 
342                  *
343                  */
344                 if (URI_DECODE_STATE_PATH == state) {
345                         switch (ch) {
346                         case '/':
347                         case '.':
348                                 switch (last_ch) {
349                                 case '/':       /* "/.", "//" */
350                                 case '.':       /* "..", "./" */
351                                         /* this is only set to true
352                                          * never to false */
353                                         strange_path = 1;
354                                 }
355                         }
356                         last_ch = ch;
357                 }
358 
359                 /*
360                  *
361                  * parse
362                  *
363                  */
364 
365                 switch (ch) {
366                 case 0:
367                         /* illegal character */
368                         return -1;
369                 
370                 case ':':
371                         /* 
372                          * user:pass
373                          * host:port
374                          *
375                          */
376                         if (state < URI_DECODE_STATE_PASS) {
377                                 state_mask |= (state = URI_DECODE_STATE_PASS);
378                                 
379                                 IGN_CHAR ();
380                                 DEST(&uri->pass);
381                                 continue;
382                         } else
383                         if (    state < URI_DECODE_STATE_PORT &&
384                                 state > URI_DECODE_STATE_PASS)
385                         {
386                                 state_mask |= (state = URI_DECODE_STATE_PORT);
387                                 IGN_CHAR ();
388                                 DEST(&port);
389                                 continue;
390                         } else
391                         if (state < URI_DECODE_STATE_PATH) {
392                                 /* ":" only allowed in correct state
393                                  * or after the first path "/" */
394                                 return -1;
395                         }
396                         break;
397 
398                 case '@':
399                         /* 
400                          * user:pass@host:port
401                          *      user@host
402                          *
403                          */
404                         
405                         if (state < URI_DECODE_STATE_HOST) {
406                                 state_mask |= (state = URI_DECODE_STATE_HOST);
407                                 IGN_CHAR ();
408                                 DEST(&host);
409                                 continue;
410                         } else
411                         if (state < URI_DECODE_STATE_PATH) {
412                                 /* "@" only allowed in correct state
413                                  * or after the first path "/" */
414                                 return -1;
415                         }
416 
417                         break;
418 
419                 case '/':
420                         /* /uri-path */
421                         
422                         if (state < URI_DECODE_STATE_PATH) {
423                                 state_mask |= (state = URI_DECODE_STATE_PATH);
424                                 DEST(&uri->path);
425                         }
426                         break;
427                 
428                 case '?':
429                         /* ?cgiquery */
430                         if (state < URI_DECODE_STATE_PATH)
431                                 return -1;
432                         
433                         if (state < URI_DECODE_STATE_QUERY) {
434                                 state_mask |= (state = URI_DECODE_STATE_QUERY);
435                                 IGN_CHAR ();
436                                 DEST(&uri->query);
437                                 continue;
438                         }
439                         break;
440                 
441                 case '#':
442                         /* #fragment */
443                         
444                         if (state < URI_DECODE_STATE_PATH)
445                                 return -1;
446 
447                         if (state < URI_DECODE_STATE_FRAGMENT) {
448                                 state_mask |= (state=URI_DECODE_STATE_FRAGMENT);
449                                 IGN_CHAR ();
450                                 DEST(&uri->fragment);
451                                 continue;
452                         }
453                         break;
454                 }
455                                 
456                 USE_CHAR ();
457                 IGN_CHAR ();
458         }
459         
460         if (!state_mask & URI_DECODE_STATE_PATH || 0 == uri->path.used)
461                 return -1;
462         
463         if (strange_path) {
464                 if (0 != uri_decode_strange_path (uri,
465                                         uri->path.data, uri->path.used))
466                 {
467                         return -1;
468                 }
469         }
470 
471         /* do something more if user/pass/host/port was given */
472         if (state_mask & (URI_DECODE_STATE_USER | URI_DECODE_STATE_PASS |
473                           URI_DECODE_STATE_HOST | URI_DECODE_STATE_PORT) )
474         {
475                 if (0 == (state_mask & (URI_DECODE_STATE_HOST |
476                                         URI_DECODE_STATE_PORT)) )
477                 {
478                         /* only host[:port] was given but this was
479                          * "wrongly" stored in user:pass, correct that. */
480 
481                         host = uri->user;
482                         port = uri->pass;
483 
484                         rh_buffer_destroy (&uri->user);
485                         rh_buffer_destroy (&uri->pass);
486 
487                         if (state_mask & URI_DECODE_STATE_USER) {
488                                 state_mask &= ~URI_DECODE_STATE_USER;
489                                 state_mask |=  URI_DECODE_STATE_HOST;
490                         }
491                         
492                         if (state_mask & URI_DECODE_STATE_PASS) {
493                                 state_mask &= ~URI_DECODE_STATE_PASS;
494                                 state_mask |=  URI_DECODE_STATE_PORT;
495                         }
496                 }
497                         
498                 if (state_mask & URI_DECODE_STATE_USER && 0 == uri->user.used)
499                         return -1;
500                 
501                 if (state_mask & URI_DECODE_STATE_PASS && 0 == uri->pass.used)
502                         return -1;
503         
504                 /* validate/lowercase host:port */
505                 if (state_mask & URI_DECODE_STATE_HOST) {
506                         size_t  nhost;
507                 
508                         /* 
509                          * this uses the whole constant buffer from
510                          * host to port: "host:12345"
511                          *
512                          */
513                         nhost = host.used;
514                         
515                         if (state_mask & URI_DECODE_STATE_PORT) {
516                                 if (0 == port.used) 
517                                         return -1;
518 
519                                 nhost += CONST_LEN(":") + port.used;
520                         }
521                 
522                         /* parse host:port */
523                         if (0 != http_host_parse(&uri->host, host.data, nhost))
524                                 return -1;
525                 }
526         }
527 
528         uri->decode_state = state_mask;
529                         
530         return 0;
531 
532 #undef IGN_CHAR
533 #undef USE_CHAR
534 #undef DEST
535 }
536 
537 
538 /*
539  *
540  * removes strange paths parts from the uri->path component.
541  *
542  * if something is wrong (eg: "/dir/../../../whatever") this functions
543  * returns -1
544  * 
545  *
546  * XXX: Win32: "/\..\/"  (or something like that)
547  *
548  */
549 
550 static int uri_decode_strange_path (uri_t *uri, const char *src, size_t nsrc)
551 {
552         char    *dst;
553         size_t  ndst;
554         
555         /* remove: "//", "/./", "/../"
556          *  valid: "/.test.html" "/dir/../file.html"
557          * 
558          *
559          */
560 
561         dst  = uri->path.data;
562         ndst = 0;
563         
564         while (nsrc) {
565                 if ('/' == src[0]) {
566                         /* remove ALL following "/" */
567                         for (;nsrc>1 && src[1] == '/';++src,--nsrc);
568 #if 0
569                         printf ("%s(): src[%.*s]\n", __FUNCTION__,
570                                         nsrc, src);
571 #endif
572 
573                         /* "/./" */
574                         if (nsrc > 2 && RH_EQUAL2(src+1,"./")) {
575                                 src  += 2;
576                                 nsrc -= 2;
577                                 continue;
578                         }
579                         
580                         /* "/../" */
581                         if (    nsrc > 3 &&
582                                 src[1] == '.' &&
583                                 src[2] == '.' &&
584                                 src[3] == '/')
585                         {
586                                 
587                                 /* backward scan for previous dir */
588                                 for (;;) {
589                                         /* running outside the root? */
590                                         if (0 == ndst)
591                                                 return -1;
592                                         
593                                         --ndst;
594                                         
595                                         /* got a previous path? */
596                                         if ('/' == uri->path.data[ndst])
597                                                 break;
598                                 }
599 
600                                 dst = uri->path.data + ndst;
601                         
602                                 src  += 3;
603                                 nsrc -= 3;
604                                 
605                                 continue;
606                         }
607                 }
608 
609                 /* copy src into dst */
610                 *dst++ = *src++;
611                 
612                 /* correct src/dst size */
613                 ++ndst;
614                 --nsrc;
615         }
616 
617         uri->path.used = ndst;
618 
619         return 0;
620 }
621 
622 /*
623  *
624  * decode a hex encoded uri to (including) '?'.
625  *
626  * after the '?' the characters validated against invalid
627  * hex encodings, eg: 0xXY 
628  *
629  * if a '#' is found then the fragment is hex decoded.
630  *
631  * decoded characters are tested against MAP_URI_DECODED if its not
632  * the query part.
633  *
634  */
635 
636 static int uri_decode_hex (rh_buffer_t *buffer, const char *src, size_t nsrc)
637 {
638         size_t  space_left;
639         int     decode;
640 
641         if (rh_buffer_reserve (buffer, nsrc + 32))
642                 return -1;
643 
644         decode = 1;
645         
646         if (0 == nsrc)
647                 return 0;
648 
649         for (;;) {
650                 for (   space_left = rh_buffer_size(buffer);
651                         nsrc && space_left > 2;
652                         --space_left)
653                 {
654                         unsigned char   ch, ch_decoded;
655                         
656                         ch = src[0];
657 
658                         if ('%' != ch) {
659                                 if ('?' == ch)
660                                         decode = 0;
661                                 if ('#' == ch)
662                                         decode = 1;
663                         
664                                 buffer->data[buffer->used++] = ch;
665                                 
666                                 ++src;
667                                 --nsrc;
668                         } else {
669                                 unsigned char   hex1, hex2;
670 
671                                 if (nsrc < 2)
672                                         return -1;
673 
674                                 hex1 = MAP_XDIGIT[(unsigned char)(src[1])];
675                                 hex2 = MAP_XDIGIT[(unsigned char)(src[2])];
676 
677                                 /* illegal hex encoding, eg: 0xQA */
678                                 if (0xff == hex1 || 0xff == hex2)
679                                         return -1;
680 
681                                 if (!decode) {
682                                         buffer->data[buffer->used++] = '%';
683                                         buffer->data[buffer->used++] = src[1];
684                                         buffer->data[buffer->used++] = src[2];
685                                 } else {
686                                         ch_decoded = hex1 * 16 + hex2;
687 
688                                         /* test if decoded character is valid */
689 
690                                         if (0 == MAP_URI_DECODED[ch_decoded])
691                                                 return -1;
692                                         
693                                         buffer->data[buffer->used++] =
694                                                 ch_decoded;
695                                 
696                                         if ('?' == ch_decoded)
697                                                 decode = 0;
698                                         
699                                         if ('#' == ch_decoded)
700                                                 decode = 1;
701 
702                                 }
703                                 src  += 3;
704                                 nsrc -= 3;              
705                         }
706                 }
707 
708                 if (0 == nsrc)
709                         return 0;
710 
711                 if (rh_buffer_reserve (buffer, nsrc + 32))
712                         return -1;
713         }
714 
715         /* never reached */
716 }


syntax highlighted by Code2HTML, v. 0.9.1