↑ 1 #include <string.h>
↑ 2 #include <stdio.h>
↑ 3 #include <limits.h>
↑ 4 #include "rh_string.h"
↑ 5 #include "uri.h"
↑ 6
↑ 7 #define MAP_WANT_XDIGIT MAP_XDIGIT
↑ 8 #define MAP_WANT_HTTP_HOST MAP_HOST
↑ 9 #define MAP_WANT_HTTP_URI MAP_URI
↑ 10 #define MAP_WANT_HTTP_URI_DECODED MAP_URI_DECODED
↑ 11 #include "rh_charmap.h"
↑ 12
/*
 *
 * how the uri decoder works:
 *
 *
 * 1. test the whole raw uri for illegal characters
 *
 * 2. hex decode the uri (only if a % sign was found in step 1)
 *
 * 3. validate the decoded, non-query characters (%00 may be used in ?query)
 *
 * 4. split the uri into its parts (user, pass, host, query, etc.)
 *
 * 5. validate/lowercase host:port, if one was given
 *
 */
↑ 29
↑ 30 static const
↑ 31 struct uri_scheme {
↑ 32 int scheme;
↑ 33 const char *name;
↑ 34 unsigned short length;
↑ 35 unsigned short port;
↑ 36 } uri_scheme[] = {
↑ 37 { URI_SCHEME_UNSET, CONST_STR_LEN(""), 0 },
↑ 38 { URI_SCHEME_HTTP , CONST_STR_LEN("http://"), 80 },
↑ 39 { URI_SCHEME_HTTPS, CONST_STR_LEN("https://"), 443 },
↑ 40 { URI_SCHEME_FTP , CONST_STR_LEN("ftp://"), 21 },
↑ 41 { -1, NULL, }
↑ 42 };
↑ 43
↑ 44 static int uri_decode_hex (rh_buffer_t *buffer, const char *src, size_t nsrc);
↑ 45
↑ 46 static int uri_decode_strange_path (uri_t *uri, const char *src, size_t nsrc);
↑ 47
↑ 48 static int uri_decode (uri_t *uri, const char *src, size_t nsrc);
↑ 49
↑ 50
↑ 51 void uri_init (uri_t *uri)
↑ 52 {
↑ 53 memset(uri, 0, sizeof(*uri));
↑ 54 }
↑ 55
↑ 56 void uri_destroy (uri_t *uri)
↑ 57 {
↑ 58 rh_buffer_destroy (&uri->raw);
↑ 59 rh_buffer_destroy (&uri->log);
↑ 60 rh_buffer_destroy (&uri->decoded);
↑ 61 rh_buffer_destroy (&uri->user);
↑ 62 rh_buffer_destroy (&uri->pass);
↑ 63 rh_buffer_destroy (&uri->path);
↑ 64 rh_buffer_destroy (&uri->query);
↑ 65 rh_buffer_destroy (&uri->fragment);
↑ 66 rh_buffer_destroy (&uri->filename);
↑ 67
↑ 68 http_host_destroy (&uri->host);
↑ 69 }
↑ 70
↑ 71 void uri_reset (uri_t *uri)
↑ 72 {
↑ 73 uri_destroy (uri);
↑ 74 uri_init (uri);
↑ 75 }
↑ 76
↑ 77 int uri_parse (uri_t *uri, const char *src, size_t nsrc)
↑ 78 {
↑ 79 size_t offset;
↑ 80 int decode_hex;
↑ 81
↑ 82 if (rh_buffer_append (&uri->raw, src, nsrc))
↑ 83 goto error_out;
↑ 84
↑ 85 decode_hex = 0;
↑ 86
↑ 87 /* illegal character test, without hex decoding */
↑ 88 for (offset=0; offset<nsrc; ++offset) {
↑ 89 switch ( MAP_URI[(unsigned char)src[offset]] ) {
↑ 90 case 0:
↑ 91 goto error_out;
↑ 92 case '%':
↑ 93 decode_hex = 1;
↑ 94 }
↑ 95 }
↑ 96
↑ 97 if (0 == decode_hex) {
↑ 98 rh_buffer_set_const(&uri->decoded,uri->raw.data,uri->raw.used);
↑ 99 } else {
↑100 /*
↑101 * decode hex
↑102 * validate decoded non query characters against MAP_URI_DECODED
↑103 *
↑104 * (" " is invalid in the uri, but its valid as hex %20)
↑105 *
↑106 */
↑107 if (0 != uri_decode_hex (&uri->decoded,
↑108 uri->raw.data, uri->raw.used))
↑109 {
↑110 goto error_out;
↑111 }
↑112 }
↑113
↑114 /*
↑115 * seperate uri parts
↑116 * validate/lowercase host:port
↑117 *
↑118 */
↑119 if (0 != uri_decode (uri, uri->decoded.data, uri->decoded.used)) {
↑120 goto error_out;
↑121 }
↑122
↑123 /* decoding was successfully, its safe to use the raw uri
↑124 * for logging since all characters were valid */
↑125 rh_buffer_set_const(&uri->log, uri->raw.data, uri->raw.used);
↑126
↑127 #if 0
↑128 printf ("uri_decode: [%.*s]\n",
↑129 uri->decoded.used,
↑130 uri->decoded.data );
↑131
↑132 uri_dump (uri);
↑133 #endif
↑134
↑135 return 0;
↑136
↑137 error_out:
↑138
↑139 /*
↑140 * XXX
↑141 *
↑142 * build uri->log for logging by
↑143 * encoding invalid characters with \NN (NN==hex)
↑144 *
↑145 */
↑146
↑147 uri->log.used = uri->log.size = 0;
↑148
↑149 return -1;
↑150 }
↑151
↑152
↑153 void uri_dump (uri_t *uri)
↑154 {
↑155 printf ("%s(%p)\n", __FUNCTION__, (void*)uri);
↑156
↑157 printf (" filename : [%3u:%.*s]\n",
↑158 uri->filename.used,
↑159 uri->filename.used,
↑160 uri->filename.data);
↑161
↑162 printf (" user : [%3u:%.*s]\n",
↑163 uri->user.used,
↑164 uri->user.used,
↑165 uri->user.data);
↑166
↑167 printf (" pass : [%3u:%.*s]\n",
↑168 uri->pass.used,
↑169 uri->pass.used,
↑170 uri->pass.data);
↑171
↑172 printf (" host : [%3u:%.*s]\n",
↑173 uri->host.host.used,
↑174 uri->host.host.used,
↑175 uri->host.host.data);
↑176
↑177 printf (" port : [ :%d]\n",
↑178 uri->host.port);
↑179
↑180 printf (" path : [%3u:%.*s]\n",
↑181 uri->path.used,
↑182 uri->path.used,
↑183 uri->path.data);
↑184
↑185 printf (" query : [%3u:%.*s]\n",
↑186 uri->query.used,
↑187 uri->query.used,
↑188 uri->query.data);
↑189
↑190 printf (" fragment : [%3u:%.*s]\n",
↑191 uri->fragment.used,
↑192 uri->fragment.used,
↑193 uri->fragment.data);
↑194 }
↑195
↑196 /*
↑197 * get the scheme of a proxy uri
↑198 *
↑199 */
↑200 static const struct uri_scheme * uri_get_scheme (const char *src, size_t nsrc)
↑201 {
↑202 /* 01234567
↑203 *
↑204 * https://
↑205 * http://
↑206 * ftp://
↑207 *
↑208 * s
↑209 * :
↑210 * /
↑211 *
↑212 * ^-------> uniq characters, only one may differ in case
↑213 */
↑214
↑215 if (nsrc < CONST_LEN("ftp://"))
↑216 return NULL;
↑217
↑218 switch ( ((unsigned char*)src)[4] ) {
↑219 case ':':
↑220 /* http */
↑221
↑222
↑223 /*
↑224 * 1234567 increment
↑225 * 0123456 offset
↑226 *
↑227 * http://
↑228 * 1234
↑229 *
↑230 */
↑231
↑232 if (!RH_EQUAL2("//", src+5))
↑233 return NULL;
↑234
↑235 /* fast try */
↑236 if (!RH_EQUAL4("http", src)) {
↑237 if (src[0] != 'h' && src[0] != 'H') return NULL;
↑238 if (src[1] != 't' && src[1] != 'T') return NULL;
↑239 if (src[2] != 't' && src[2] != 'T') return NULL;
↑240 if (src[3] != 'p' && src[3] != 'P') return NULL;
↑241 }
↑242
↑243 return &uri_scheme[URI_SCHEME_HTTP];
↑244
↑245 case 's':
↑246 case 'S':
↑247 /* https */
↑248 return NULL; /* atm not supported */
↑249 break;
↑250
↑251
↑252 case '/':
↑253 /* ftp */
↑254 return NULL; /* atm not supported */
↑255 break;
↑256 }
↑257
↑258 return NULL;
↑259 }
↑260
↑261
↑262 /*
↑263 *
↑264 * parses a uri into its components
↑265 *
↑266 * parses "strange" path's (eg: "/dir/../")
↑267 *
↑268 * ---- WARNING --------------------------
↑269 * >
↑270 * > src has to be hex decoded.
↑271 * >
↑272 * ---------------------------------------
↑273 *
↑274 * uri is of the form:
↑275 *
↑276 * [schema://[user[:pass]@]host[:port]]/uri[?query][#fragment]
↑277 *
↑278 */
↑279 static int uri_decode (uri_t *uri, const char *src, size_t nsrc)
↑280 {
↑281 int state; /* current state */
↑282 int state_mask; /* current state mask */
↑283 unsigned char last_ch; /* last character */
↑284 int strange_path; /* true if the uri has a strange path
↑285 (eg. "/../")
↑286 which requires revalidation. */
↑287 rh_buffer_t *dest; /* into which buffer should written */
↑288 rh_buffer_t host = {0, }; /* temp. used host/port buffer */
↑289 rh_buffer_t port = {0, };
↑290
↑291
↑292
↑293 /* uri has to be at least one character ("/") long */
↑294 if (!nsrc)
↑295 return -1;
↑296
↑297 /* macro to simplify to set the current destination buffer */
↑298 #define DEST(_dest) \
↑299 { dest = (_dest); \
↑300 dest->flags = RH_BUFFER_CONST; \
↑301 dest->data = (char*)src; \
↑302 dest->used = 0; \
↑303 }
↑304
↑305 /* macro to simplify inc/dec-rementing src */
↑306 #define IGN_CHAR() { ++src; --nsrc; }
↑307
↑308 /* macro to simplify "using the current character for dest" */
↑309 #define USE_CHAR() { ++dest->used; }
↑310
↑311 if ('/' == src[0]) {
↑312 state_mask = state = URI_DECODE_STATE_PATH;
↑313 DEST(&uri->path);
↑314 } else {
↑315 const struct uri_scheme * scheme;
↑316
↑317 scheme = uri_get_scheme (src, nsrc);
↑318 if (NULL == scheme)
↑319 return -1;
↑320
↑321 src += scheme->length;
↑322 nsrc -= scheme->length;
↑323
↑324 state_mask = state = URI_DECODE_STATE_USER;
↑325 DEST(&uri->user);
↑326 }
↑327 #if 0
↑328 printf ("%s(): uri(%2d:%.*s)\n", __FUNCTION__, nsrc, nsrc , src );
↑329 #endif
↑330
↑331 last_ch = 0;
↑332 strange_path = 0;
↑333
↑334 while (nsrc) {
↑335 unsigned char ch;
↑336
↑337 ch = src[0];
↑338
↑339 /*
↑340 * test for strange path stuff ("/../" and friends)
↑341 *
↑342 *
↑343 */
↑344 if (URI_DECODE_STATE_PATH == state) {
↑345 switch (ch) {
↑346 case '/':
↑347 case '.':
↑348 switch (last_ch) {
↑349 case '/': /* "/.", "//" */
↑350 case '.': /* "..", "./" */
↑351 /* this is only set to true
↑352 * never to false */
↑353 strange_path = 1;
↑354 }
↑355 }
↑356 last_ch = ch;
↑357 }
↑358
↑359 /*
↑360 *
↑361 * parse
↑362 *
↑363 */
↑364
↑365 switch (ch) {
↑366 case 0:
↑367 /* illegal character */
↑368 return -1;
↑369
↑370 case ':':
↑371 /*
↑372 * user:pass
↑373 * host:port
↑374 *
↑375 */
↑376 if (state < URI_DECODE_STATE_PASS) {
↑377 state_mask |= (state = URI_DECODE_STATE_PASS);
↑378
↑379 IGN_CHAR ();
↑380 DEST(&uri->pass);
↑381 continue;
↑382 } else
↑383 if ( state < URI_DECODE_STATE_PORT &&
↑384 state > URI_DECODE_STATE_PASS)
↑385 {
↑386 state_mask |= (state = URI_DECODE_STATE_PORT);
↑387 IGN_CHAR ();
↑388 DEST(&port);
↑389 continue;
↑390 } else
↑391 if (state < URI_DECODE_STATE_PATH) {
↑392 /* ":" only allowed in correct state
↑393 * or after the first path "/" */
↑394 return -1;
↑395 }
↑396 break;
↑397
↑398 case '@':
↑399 /*
↑400 * user:pass@host:port
↑401 * user@host
↑402 *
↑403 */
↑404
↑405 if (state < URI_DECODE_STATE_HOST) {
↑406 state_mask |= (state = URI_DECODE_STATE_HOST);
↑407 IGN_CHAR ();
↑408 DEST(&host);
↑409 continue;
↑410 } else
↑411 if (state < URI_DECODE_STATE_PATH) {
↑412 /* "@" only allowed in correct state
↑413 * or after the first path "/" */
↑414 return -1;
↑415 }
↑416
↑417 break;
↑418
↑419 case '/':
↑420 /* /uri-path */
↑421
↑422 if (state < URI_DECODE_STATE_PATH) {
↑423 state_mask |= (state = URI_DECODE_STATE_PATH);
↑424 DEST(&uri->path);
↑425 }
↑426 break;
↑427
↑428 case '?':
↑429 /* ?cgiquery */
↑430 if (state < URI_DECODE_STATE_PATH)
↑431 return -1;
↑432
↑433 if (state < URI_DECODE_STATE_QUERY) {
↑434 state_mask |= (state = URI_DECODE_STATE_QUERY);
↑435 IGN_CHAR ();
↑436 DEST(&uri->query);
↑437 continue;
↑438 }
↑439 break;
↑440
↑441 case '#':
↑442 /* #fragment */
↑443
↑444 if (state < URI_DECODE_STATE_PATH)
↑445 return -1;
↑446
↑447 if (state < URI_DECODE_STATE_FRAGMENT) {
↑448 state_mask |= (state=URI_DECODE_STATE_FRAGMENT);
↑449 IGN_CHAR ();
↑450 DEST(&uri->fragment);
↑451 continue;
↑452 }
↑453 break;
↑454 }
↑455
↑456 USE_CHAR ();
↑457 IGN_CHAR ();
↑458 }
↑459
↑460 if (!state_mask & URI_DECODE_STATE_PATH || 0 == uri->path.used)
↑461 return -1;
↑462
↑463 if (strange_path) {
↑464 if (0 != uri_decode_strange_path (uri,
↑465 uri->path.data, uri->path.used))
↑466 {
↑467 return -1;
↑468 }
↑469 }
↑470
↑471 /* do something more if user/pass/host/port was given */
↑472 if (state_mask & (URI_DECODE_STATE_USER | URI_DECODE_STATE_PASS |
↑473 URI_DECODE_STATE_HOST | URI_DECODE_STATE_PORT) )
↑474 {
↑475 if (0 == (state_mask & (URI_DECODE_STATE_HOST |
↑476 URI_DECODE_STATE_PORT)) )
↑477 {
↑478 /* only host[:port] was given but this was
↑479 * "wrongly" stored in user:pass, correct that. */
↑480
↑481 host = uri->user;
↑482 port = uri->pass;
↑483
↑484 rh_buffer_destroy (&uri->user);
↑485 rh_buffer_destroy (&uri->pass);
↑486
↑487 if (state_mask & URI_DECODE_STATE_USER) {
↑488 state_mask &= ~URI_DECODE_STATE_USER;
↑489 state_mask |= URI_DECODE_STATE_HOST;
↑490 }
↑491
↑492 if (state_mask & URI_DECODE_STATE_PASS) {
↑493 state_mask &= ~URI_DECODE_STATE_PASS;
↑494 state_mask |= URI_DECODE_STATE_PORT;
↑495 }
↑496 }
↑497
↑498 if (state_mask & URI_DECODE_STATE_USER && 0 == uri->user.used)
↑499 return -1;
↑500
↑501 if (state_mask & URI_DECODE_STATE_PASS && 0 == uri->pass.used)
↑502 return -1;
↑503
↑504 /* validate/lowercase host:port */
↑505 if (state_mask & URI_DECODE_STATE_HOST) {
↑506 size_t nhost;
↑507
↑508 /*
↑509 * this uses the whole constant buffer from
↑510 * host to port: "host:12345"
↑511 *
↑512 */
↑513 nhost = host.used;
↑514
↑515 if (state_mask & URI_DECODE_STATE_PORT) {
↑516 if (0 == port.used)
↑517 return -1;
↑518
↑519 nhost += CONST_LEN(":") + port.used;
↑520 }
↑521
↑522 /* parse host:port */
↑523 if (0 != http_host_parse(&uri->host, host.data, nhost))
↑524 return -1;
↑525 }
↑526 }
↑527
↑528 uri->decode_state = state_mask;
↑529
↑530 return 0;
↑531
↑532 #undef IGN_CHAR
↑533 #undef USE_CHAR
↑534 #undef DEST
↑535 }
↑536
↑537
↑538 /*
↑539 *
↑540 * removes strange paths parts from the uri->path component.
↑541 *
↑542 * if something is wrong (eg: "/dir/../../../whatever") this functions
↑543 * returns -1
↑544 *
↑545 *
↑546 * XXX: Win32: "/\..\/" (or something like that)
↑547 *
↑548 */
↑549
↑550 static int uri_decode_strange_path (uri_t *uri, const char *src, size_t nsrc)
↑551 {
↑552 char *dst;
↑553 size_t ndst;
↑554
↑555 /* remove: "//", "/./", "/../"
↑556 * valid: "/.test.html" "/dir/../file.html"
↑557 *
↑558 *
↑559 */
↑560
↑561 dst = uri->path.data;
↑562 ndst = 0;
↑563
↑564 while (nsrc) {
↑565 if ('/' == src[0]) {
↑566 /* remove ALL following "/" */
↑567 for (;nsrc>1 && src[1] == '/';++src,--nsrc);
↑568 #if 0
↑569 printf ("%s(): src[%.*s]\n", __FUNCTION__,
↑570 nsrc, src);
↑571 #endif
↑572
↑573 /* "/./" */
↑574 if (nsrc > 2 && RH_EQUAL2(src+1,"./")) {
↑575 src += 2;
↑576 nsrc -= 2;
↑577 continue;
↑578 }
↑579
↑580 /* "/../" */
↑581 if ( nsrc > 3 &&
↑582 src[1] == '.' &&
↑583 src[2] == '.' &&
↑584 src[3] == '/')
↑585 {
↑586
↑587 /* backward scan for previous dir */
↑588 for (;;) {
↑589 /* running outside the root? */
↑590 if (0 == ndst)
↑591 return -1;
↑592
↑593 --ndst;
↑594
↑595 /* got a previous path? */
↑596 if ('/' == uri->path.data[ndst])
↑597 break;
↑598 }
↑599
↑600 dst = uri->path.data + ndst;
↑601
↑602 src += 3;
↑603 nsrc -= 3;
↑604
↑605 continue;
↑606 }
↑607 }
↑608
↑609 /* copy src into dst */
↑610 *dst++ = *src++;
↑611
↑612 /* correct src/dst size */
↑613 ++ndst;
↑614 --nsrc;
↑615 }
↑616
↑617 uri->path.used = ndst;
↑618
↑619 return 0;
↑620 }
↑621
/*
 *
 * hex decodes a uri up to (and including) the '?'.
 *
 * after the '?' the characters are only validated against invalid
 * hex encodings, eg: 0xXY
 *
 * if a '#' is found then the fragment is hex decoded.
 *
 * decoded characters are tested against MAP_URI_DECODED unless they
 * belong to the query part.
 *
 */
↑635
↑636 static int uri_decode_hex (rh_buffer_t *buffer, const char *src, size_t nsrc)
↑637 {
↑638 size_t space_left;
↑639 int decode;
↑640
↑641 if (rh_buffer_reserve (buffer, nsrc + 32))
↑642 return -1;
↑643
↑644 decode = 1;
↑645
↑646 if (0 == nsrc)
↑647 return 0;