/*
 * Database - Database abstraction layer for D programming language.
 *
 * Copyright (C) 2017  Shanghai Putao Technology Co., Ltd
 *
 * Developer: HuntLabs
 *
 * Licensed under the Apache-2.0 License.
 *
 */

module database.url;

import std.algorithm;
import std.array;
import std.conv;
import std.encoding;
import std.string;
import std.utf;

@safe:

/// Thrown when a URL (or one of its percent/puny-encoded components) is malformed.
class URLException : Exception {
    this(string msg) { super(msg); }
}

/// Maps a lower-case URL scheme to the port used when the URL does not name one.
ushort[string] schemeToDefaultPort;

static this()
{
    schemeToDefaultPort = [
        "aaa": 3868,
        "aaas": 5658,
        "acap": 674,
        "amqp": 5672,
        "cap": 1026,
        "coap": 5683,
        "coaps": 5684,
        "dav": 443,
        "dict": 2628,
        "ftp": 21,
        "git": 9418,
        "go": 1096,
        "gopher": 70,
        "http": 80,
        "https": 443,
        "ws": 80,
        "wss": 443,
        "iac": 4569,
        "icap": 1344,
        "imap": 143,
        "ipp": 631,
        "ipps": 631,  // yes, they're both mapped to port 631
        "irc": 6667,  // De facto default port, not the IANA reserved port.
        "ircs": 6697,
        "iris": 702,  // defaults to iris.beep
        "iris.beep": 702,
        "iris.lwz": 715,
        "iris.xpc": 713,
        "iris.xpcs": 714,
        "jabber": 5222,  // client-to-server
        "ldap": 389,
        "ldaps": 636,
        "msrp": 2855,
        "msrps": 2855,
        "mtqp": 1038,
        "mupdate": 3905,
        "news": 119,
        "nfs": 2049,
        "pop": 110,
        "redis": 6379,
        "reload": 6084,
        "rsync": 873,
        "rtmfp": 1935,
        "rtsp": 554,
        "shttp": 80,
        "sieve": 4190,
        "sip": 5060,
        "sips": 5061,
        "smb": 445,
        "smtp": 25,
        "snews": 563,
        "snmp": 161,
        "soap.beep": 605,
        "ssh": 22,
        "stun": 3478,
        "stuns": 5349,
        "svn": 3690,
        "teamspeak": 9987,
        "telnet": 23,
        "tftp": 69,
        "tip": 3372,
        "mysql": 3306,
        "postgresql": 5432
    ];
}

/**
 * A collection of query parameters.
 *
 * This is effectively a multimap of string -> strings. Insertion order is preserved.
 */
struct QueryParams {
    import std.typecons;
    alias Tuple!(string, "key", string, "value") Param;
    Param[] params;

    /// The number of key/value pairs stored (duplicated keys count once per pair).
    @property size_t length() {
        return params.length;
    }

    /**
     * Get the values stored for the given key, in insertion order.
     *
     * Only parameters whose key equals `key` contribute; the result is empty when the key is
     * absent.
     */
    auto opIndex(string key) {
        string[] values;
        foreach (p; params) {
            if (p.key == key) {
                values ~= p.value;
            }
        }
        return values;
    }

    /// Add a query parameter with the given key and value.
    /// If one already exists, there will now be two query parameters with the given name.
    void add(string key, string value) {
        params ~= Param(key, value);
    }

    /// Add a query parameter with the given key and value.
    /// If there are any existing parameters with the same key, they are removed and overwritten.
    void overwrite(string key, string value) {
        // Build a fresh array instead of removing in place: every existing entry for the key is
        // dropped (not just every other one), the order of the remaining entries is kept, and
        // URL copies that still share the old array are not disturbed.
        Param[] kept;
        kept.reserve(params.length + 1);
        foreach (p; params) {
            if (p.key != key) {
                kept ~= p;
            }
        }
        kept ~= Param(key, value);
        params = kept;
    }

    private struct QueryParamRange {
        size_t i;
        const(Param)[] params;
        bool empty() { return i >= params.length; }
        void popFront() { i++; }
        Param front() { return params[i]; }
    }

    /**
     * A range over the query parameters.
     *
     * Usage:
     * ---
     * foreach (key, value; url.queryParams) {}
     * ---
     */
    auto range() {
        return QueryParamRange(0, this.params);
    }
    /// ditto
    alias range this;
}

/**
 * A Unique Resource Locator.
 *
 * URLs can be parsed (see parseURL) and implicitly convert to strings.
 */
struct URL {
    /// The URL scheme. For instance, ssh, ftp, or https.
    string scheme;

    /// The username in this URL. Usually absent. If present, there will also be a password.
    string user;

    /// The password in this URL. Usually absent.
    string pass;

    /// The hostname.
    string host;

    /**
     * The port.
     *
     * This is inferred from the scheme if it isn't present in the URL itself.
     * If the scheme is not known and the port is not present, the port will be given as 0.
     * For some schemes, port will not be sensible -- for instance, file or chrome-extension.
     *
     * If you explicitly need to detect whether the user provided a port, check the providedPort
     * field.
     */
    @property ushort port() {
        if (providedPort != 0) {
            return providedPort;
        }
        if (auto p = scheme in schemeToDefaultPort) {
            return *p;
        }
        return 0;
    }

    /**
     * Set the port.
     *
     * This sets the providedPort field and is provided for convenience.
     */
    @property ushort port(ushort value) {
        return providedPort = value;
    }

    /// The port that was explicitly provided in the URL.
    ushort providedPort;

    /**
     * The path.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
     * "/news/story/17774".
     */
    string path;

    /**
     * Deprecated: this disallows multiple values for the same query string. Please use queryParams
     * instead.
     *
     * The query string elements.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
     * elements will be ["visited": "false"].
     *
     * Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
     * ["item": ""].
     *
     * This field is mutable, so be cautious.
     */
    string[string] query;

    /**
     * The query parameters associated with this URL.
     */
    QueryParams queryParams;

    /**
     * The fragment. In web documents, this typically refers to an anchor element.
     * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
     */
    string fragment;

    /**
     * Convert this URL to a string.
     * The string is properly formatted and usable for, eg, a web request.
     */
    string toString() {
        return toString(false);
    }

    /**
     * Convert this URL to a string.
     * The string is intended to be human-readable rather than machine-readable.
     */
    string toHumanReadableString() {
        return toString(true);
    }

    /*
     * Shared serializer. When humanReadable is false the user, password, path segments, query
     * and fragment are percent-encoded and the host is puny-encoded; when true they are emitted
     * verbatim (query and fragment are still percent-encoded in both modes).
     */
    private string toString(bool humanReadable) {
        Appender!string s;
        s ~= scheme;
        s ~= "://";
        if (user) {
            s ~= humanReadable ? user : user.percentEncode;
            s ~= ":";
            s ~= humanReadable ? pass : pass.percentEncode;
            s ~= "@";
        }
        s ~= humanReadable ? host : host.toPuny;
        if (providedPort) {
            // Only spell out the port when it differs from the scheme's default.
            auto defaultPort = scheme in schemeToDefaultPort;
            if (defaultPort is null || *defaultPort != providedPort) {
                s ~= ":";
                s ~= providedPort.to!string;
            }
        }
        string p = path;
        if (p.length == 0 || p == "/") {
            s ~= '/';
        } else {
            if (p[0] == '/') {
                p = p[1..$];
            }
            if (humanReadable) {
                s ~= p;
            } else {
                // Encode each segment separately so the separators themselves survive.
                foreach (part; p.split('/')) {
                    s ~= '/';
                    s ~= part.percentEncode;
                }
            }
        }
        if (queryParams.length) {
            bool first = true;
            s ~= '?';
            foreach (k, v; queryParams) {
                if (!first) {
                    s ~= '&';
                }
                first = false;
                s ~= k.percentEncode;
                if (v.length > 0) {
                    s ~= '=';
                    s ~= v.percentEncode;
                }
            }
        } else if (query) {
            // Legacy associative-array query; iteration order is unspecified.
            s ~= '?';
            bool first = true;
            foreach (k, v; query) {
                if (!first) {
                    s ~= '&';
                }
                first = false;
                s ~= k.percentEncode;
                if (v.length > 0) {
                    s ~= '=';
                    s ~= v.percentEncode;
                }
            }
        }
        if (fragment) {
            s ~= '#';
            s ~= fragment.percentEncode;
        }
        return s.data;
    }

    /// Implicitly convert URLs to strings.
    alias toString this;

    /**
     * The append operator (~).
     *
     * The append operator for URLs returns a new URL with the given string appended as a path
     * element to the URL's path. It only adds new path elements (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Query elements are copied.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * auto randInt = random ~ "int";
     * writeln(randInt);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opBinary(string op : "~")(string subsequentPath) {
        URL other = this;
        other ~= subsequentPath;
        if (query) {
            other.query = other.query.dup;
        }
        // Copy the parameter list as well so the two URLs never alias the same array.
        other.queryParams.params = queryParams.params.dup;
        return other;
    }

    /**
     * The append-in-place operator (~=).
     *
     * The append operator for URLs adds a path element to this URL. It only adds new path elements
     * (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * random ~= "int";
     * writeln(random);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opOpAssign(string op : "~")(string subsequentPath) {
        if (path.endsWith("/")) {
            if (subsequentPath.startsWith("/")) {
                path ~= subsequentPath[1..$];
            } else {
                path ~= subsequentPath;
            }
        } else {
            if (!subsequentPath.startsWith("/")) {
                path ~= '/';
            }
            path ~= subsequentPath;
        }
        return this;
    }
}

/**
 * Parse a URL from a string.
 *
 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
 * may be made. However, any URL in a correct format will be parsed correctly.
 *
 * Params:
 *   value = the text to parse
 *   url = receives the parsed URL; reset to URL.init when parsing fails
 *
 * Returns: true if the text could be parsed, false otherwise. Malformed percent-encoding,
 * Punycode, UTF-8 or port numbers are reported by returning false.
 */
bool tryParseURL(string value, out URL url) {
    try {
        parseURLInto(value, url);
        return true;
    } catch (URLException) {
    } catch (ConvException) {
    } catch (UTFException) {
    }
    url = URL.init;
    return false;
}

/*
 * Does the actual work for tryParseURL, throwing on malformed input.
 *
 * Grammar: scheme:[//[user[:password]@]host[:port]][/]path[?query][#fragment]
 * The scheme is optional in common use; 'http' is inferred if it's not given.
 */
private void parseURLInto(string value, ref URL url) {
    // "//" only introduces the authority when it starts the string or directly follows
    // "scheme:". A "//" further along belongs to the path or query (e.g. "a.org/x//y").
    auto slashes = value.indexOf("//");
    immutable hasAuthorityMarker = slashes == 0 ||
        (slashes > 0 && value[slashes - 1] == ':' &&
         value[0 .. slashes - 1].indexOfAny("/?#@:") == -1);
    if (hasAuthorityMarker) {
        if (slashes > 1) {
            url.scheme = value[0 .. slashes - 1];
        }
        value = value[slashes + 2 .. $];
    } else {
        url.scheme = "http";
    }

    // [user[:password]@]host[:port] runs up to the first '/', '?' or '#'.
    auto authEnd = value.indexOfAny("/?#");
    string authority = value;
    string rest;
    if (authEnd > -1) {
        authority = value[0 .. authEnd];
        rest = value[authEnd .. $];
    }

    auto at = authority.indexOf('@');
    if (at > -1) {
        auto userinfo = authority[0 .. at];
        authority = authority[at + 1 .. $];
        auto sep = userinfo.indexOf(':');
        if (sep == -1) {
            url.user = userinfo.percentDecode;
        } else {
            url.user = userinfo[0 .. sep].percentDecode;
            url.pass = userinfo[sep + 1 .. $].percentDecode;
        }
    }

    auto colon = authority.indexOf(':');
    if (colon == -1) {
        url.host = authority.fromPuny;
    } else {
        url.host = authority[0 .. colon].fromPuny;
        // An empty or non-numeric port raises ConvException, i.e. a parse failure.
        url.providedPort = authority[colon + 1 .. $].to!ushort;
    }

    if (rest.length == 0) {
        return;
    }

    // [/]path[?query][#fragment]
    auto mark = rest.indexOfAny("?#");
    if (mark == -1) {
        url.path = rest.percentDecode;
        return;
    }
    url.path = rest[0 .. mark].percentDecode;
    immutable hasQuery = rest[mark] == '?';
    rest = rest[mark + 1 .. $];

    if (hasQuery) {
        auto hash = rest.indexOf('#');
        string queryString;
        if (hash < 0) {
            queryString = rest;
            rest = null;
        } else {
            queryString = rest[0 .. hash];
            rest = rest[hash + 1 .. $];
        }
        foreach (pair; queryString.split('&')) {
            auto eq = pair.indexOf('=');
            string key = pair;
            string val;
            if (eq >= 0) {
                key = pair[0 .. eq];
                val = pair[eq + 1 .. $];
            }
            key = key.percentDecode;
            val = val.percentDecode;
            url.query[key] = val;
            url.queryParams.add(key, val);
        }
    }

    url.fragment = rest.percentDecode;
}

unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            query["hello"] = "world";
            query["gibe"] = "clay";
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            query["❄"] = "❀";
            query["["] = "]";
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            query["hi"] = "bye";
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest
{
    auto url = "//foo/bar".parseURL;
    assert(url.host == "foo", "expected host foo, got " ~ url.host);
    assert(url.path == "/bar");
}

unittest
{
    auto url = "localhost:5984".parseURL;
    auto url2 = url ~ "db1";
    assert(url2.toString == "http://localhost:5984/db1", url2.toString);
    auto url3 = url2 ~ "_all_docs";
    assert(url3.toString == "http://localhost:5984/db1/_all_docs", url3.toString);
}

///
unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("gibe", "clay");
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Passing an array of query values.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("hello", "aether");
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&hello=aether#frag" ||
                url.toString == "https://example.org/foo/bar?hello=aether&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            queryParams.add("❄", "❀");
            queryParams.add("[", "]");
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            queryParams.add("hi", "bye");
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest {
    // Percent decoding.

    // http://#:!:@
    auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash";
    auto url = urlString.parseURL;
    assert(url.user == "#");
    assert(url.pass == "!:");
    assert(url.host == "example.org");
    assert(url.path == "/{/}");
    assert(url.queryParams[";"].front == "");
    assert(url.queryParams["&"].front == "=");
    assert(url.fragment == "#hash");

    // Round trip.
    assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString);
    assert(urlString == urlString.parseURL.toString.parseURL.toString);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.host == "☂.☃.org", url.host);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
    assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString);
}

unittest {
    auto url = "https://☂.☃.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye");
}

///
unittest {
    // There's an existing path.
    auto url = parseURL("http://example.org/foo");
    URL url2;
    // No slash? Assume it needs a slash.
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // With slash? Don't add another.
    url2 = url ~ "/bar";
    assert(url2.toString == "http://example.org/foo/bar", url2.toString);
    url ~= "bar";
    assert(url.toString == "http://example.org/foo/bar");

    // Path already ends with a slash; don't add another.
    url = parseURL("http://example.org/foo/");
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // Still don't add one even if you're appending with a slash.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "/bar";
    assert(url.toString == "http://example.org/foo/bar");

    // No path.
    url = parseURL("http://example.org");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar");

    // Path is just a slash.
    url = parseURL("http://example.org/");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar", url.toString);

    // No path, just fragment.
    url = "ircs://irc.freenode.com/#d".parseURL;
    assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}

unittest {
    import std.net.curl;
    auto url = "http://example.org".parseURL;
    assert(is(typeof(std.net.curl.get(url))));
}

unittest {
    // Regression tests for the query-parameter and parser fixes.
    QueryParams qp;
    qp.add("a", "1");
    qp.add("b", "2");
    qp.add("a", "3");
    assert(qp["a"] == ["1", "3"]);
    assert(qp["b"] == ["2"]);
    assert(qp["missing"].length == 0);

    QueryParams dup;
    dup.add("a", "1");
    dup.add("a", "2");
    dup.add("b", "3");
    dup.overwrite("a", "z");
    assert(dup.length == 2);
    assert(dup["a"] == ["z"]);
    assert(dup["b"] == ["3"]);

    URL u;
    assert(tryParseURL("http://user@host:80/x", u));
    assert(u.user == "user" && u.host == "host" && u.providedPort == 80 && u.path == "/x");
    assert(tryParseURL("http://example.org?x=1#f", u));
    assert(u.host == "example.org" && u.queryParams["x"] == ["1"] && u.fragment == "f");
    assert(tryParseURL("example.org/a//b", u));
    assert(u.scheme == "http" && u.host == "example.org" && u.path == "/a//b");
    assert(!tryParseURL("http://example.org/%zz", u));
    assert(!tryParseURL("http://example.org/%", u));
    assert(percentDecode("%e2%98%83") == "☃");
}

/**
 * Parse the input string as a URL.
 *
 * Throws:
 *   URLException if the string was in an incorrect format.
 */
URL parseURL(string value) {
    URL url;
    if (tryParseURL(value, url)) {
        return url;
    }
    throw new URLException("failed to parse URL " ~ value);
}

///
unittest {
    {
        // Infer scheme
        auto u1 = parseURL("example.org");
        assert(u1.scheme == "http");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 80);
        assert(u1.providedPort == 0);
        assert(u1.fragment == "");
    }
    {
        // Simple host and scheme
        auto u1 = parseURL("https://example.org");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With path
        auto u1 = parseURL("https://example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With explicit port
        auto u1 = parseURL("https://example.org:1021/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 1021);
        assert(u1.providedPort == 1021);
    }
    {
        // With user
        auto u1 = parseURL("https://bob:secret@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
    }
    {
        // With user, URL-encoded
        auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob!");
        assert(u1.pass == "secret!?");
    }
    {
        // With user and port and path
        auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 2210);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
        assert(u1.fragment == "");
    }
    {
        // With query string
        auto u1 = parseURL("https://example.org/?login=true");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.queryParams["login"].front == "true");
        assert(u1.fragment == "");
    }
    {
        // With query string and fragment
        auto u1 = parseURL("https://example.org/?login=true#justkidding");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.queryParams["login"].front == "true");
        assert(u1.fragment == "justkidding");
    }
    {
        // With URL-encoded values
        auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
        assert(u1.queryParams["❄"].front == "=");
        assert(u1.fragment == "^");
    }
}

unittest {
    assert(parseURL("http://example.org").port == 80);
    assert(parseURL("http://example.org:5326").port == 5326);

    auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
    assert(url.scheme == "redis");
    assert(url.user == "admin");
    assert(url.pass == "password");

    assert(parseURL("example.org").toString == "http://example.org/");
    assert(parseURL("http://example.org:80").toString == "http://example.org/");

    assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}

/**
 * Percent-encode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * ASCII letters, digits and "-._~" are copied unchanged; every other code point is written as
 * the %XX form (upper-case hex) of each octet of its UTF-8 encoding.
 */
string percentEncode(string raw) {
    // We *must* encode these characters: :/?#[]@!$&'()*+,;="
    // We *can* encode any other characters.
    // We *should not* encode alpha, numeric, or -._~.
    static immutable string hexDigits = "0123456789ABCDEF";
    Appender!string app;
    foreach (dchar d; raw) {
        immutable unreserved =
            ('a' <= d && d <= 'z') ||
            ('A' <= d && d <= 'Z') ||
            ('0' <= d && d <= '9') ||
            d == '-' || d == '.' || d == '_' || d == '~';
        if (unreserved) {
            app ~= d;
            continue;
        }
        // Encode the code point as UTF-8 (1 to 4 octets) and escape every octet.
        char[4] buf;
        immutable len = std.utf.encode(buf, d);
        foreach (octet; buf[0 .. len]) {
            immutable b = cast(ubyte) octet;
            app ~= '%';
            app ~= hexDigits[b >> 4];
            app ~= hexDigits[b & 0x0F];
        }
    }
    return app.data;
}

///
unittest {
    assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
    assert(percentEncode("~~--..__") == "~~--..__");
    assert(percentEncode("0123456789") == "0123456789");

    string e;

    e = percentEncode("☃");
    assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}

/**
 * Percent-decode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This explicitly ensures that the result is a valid UTF-8 string.
 *
 * Throws:
 *   URLException if the escapes are malformed or the decoded bytes are not valid UTF-8.
 */
@trusted string percentDecode(string encoded) {
    ubyte[] raw = percentDecodeRaw(encoded);
    // This cast is not considered @safe because it converts from one pointer type to another.
    // However, it's 1-byte values in either case, no reference types, so this won't result in any
    // memory safety errors. We also check for validity immediately.
    auto s = cast(string) raw;
    if (!s.isValid) {
        throw new URLException("input contains invalid UTF data");
    }
    return s;
}

///
unittest {
    assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
    assert(percentDecode("~~--..__") == "~~--..__");
    assert(percentDecode("0123456789") == "0123456789");

    string e;

    e = percentDecode("%E2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);
}

/**
 * Percent-decode a string into a ubyte array.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This yields a ubyte array and will not perform validation on the output. However, an improperly
 * formatted input string will result in a URLException. Hex digits are accepted in either case.
 */
ubyte[] percentDecodeRaw(string encoded) {
    // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
    Appender!(ubyte[]) app;
    for (size_t i = 0; i < encoded.length; i++) {
        if (encoded[i] != '%') {
            app ~= cast(ubyte) encoded[i];
            continue;
        }
        // Written as an addition so a trailing lone '%' cannot underflow the unsigned length.
        if (i + 2 >= encoded.length) {
            throw new URLException("Invalid percent encoded value: expected two characters after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        immutable hi = hexDigitValue(encoded[i + 1]);
        immutable lo = hexDigitValue(encoded[i + 2]);
        if (hi < 0 || lo < 0) {
            throw new URLException("Invalid percent encoded value: expected two hexadecimal " ~
                    "digits after percent symbol. Error at index " ~ i.to!string);
        }
        app ~= cast(ubyte)((hi << 4) | lo);
        i += 2;
    }
    return app.data;
}

// Value of one hexadecimal digit (either case), or -1 if `c` is not a hex digit.
private int hexDigitValue(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    return -1;
}

/*
 * Convert a (possibly Unicode) host name to its ASCII form, puny-encoding each dot-separated
 * label when the name contains any non-ASCII code point.
 *
 * Throws URLException if an ASCII character outside the accepted set is found before the first
 * non-ASCII code point.
 */
private string toPuny(string unicodeHostname) {
    bool mustEncode = false;
    foreach (i, dchar d; unicodeHostname) {
        auto c = cast(uint) d;
        if (c >= 0x80) {
            mustEncode = true;
            break;
        }
        // Reject controls/space..'+', ':'..'@', '['..'`' and '{'..DEL.
        if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
            throw new URLException(
                    format(
                        "domain name '%s' contains illegal character '%s' at position %s",
                        unicodeHostname, d, i));
        }
    }
    if (!mustEncode) {
        return unicodeHostname;
    }
    return unicodeHostname.split('.').map!punyEncode.join(".");
}

// Decode every "xn--" label of a host name back to Unicode; other labels pass through.
private string fromPuny(string hostname) {
    return hostname.split('.').map!punyDecode.join(".");
}

private {
    // Punycode parameters (RFC 3492, section 5).
    enum delimiter = '-';
    enum marker = "xn--";
    enum ulong damp = 700;
    enum ulong tmin = 1;
    enum ulong tmax = 26;
    enum ulong skew = 38;
    enum ulong base = 36;
    enum ulong initialBias = 72;
    enum dchar initialN = cast(dchar)128;

    // Bias adaptation function (RFC 3492, section 6.1).
    ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
        if (firstTime) {
            delta /= damp;
        } else {
            delta /= 2;
        }
        delta += delta / numPoints;
        ulong k = 0;
        while (delta > ((base - tmin) * tmax) / 2) {
            delta /= (base - tmin);
            k += base;
        }
        return k + (((base - tmin + 1) * delta) / (delta + skew));
    }

    // Threshold t for digit position k under the current bias, clamped to [tmin, tmax].
    ulong threshold(ulong k, ulong bias) {
        if (k <= bias) {
            return tmin;
        }
        if (k >= bias + tmax) {
            return tmax;
        }
        return k - bias;
    }
}

/**
 * Encode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-encode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "☂.☃.com";
 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
 * ---
 *
 * A segment consisting only of ASCII characters is returned unchanged.
 *
 * Throws:
 *   URLException on arithmetic overflow.
 */
string punyEncode(string input) {
    ulong delta = 0;
    dchar n = initialN;
    auto bias = initialBias;
    Appender!string output;
    output ~= marker;

    // Copy the basic (ASCII) code points first, counting how many there are.
    size_t pushed = 0;
    size_t codePoints = 0;
    foreach (dchar c; input) {
        codePoints++;
        if (c < initialN) {
            output ~= c;
            pushed++;
        }
    }
    if (pushed == codePoints) {
        // No encoding to do.
        return input;
    }
    if (pushed > 0) {
        output ~= delimiter;
    }

    bool first = true;
    while (pushed < codePoints) {
        // Find the smallest code point >= n that has not been handled yet.
        uint best = uint.max;
        foreach (dchar c; input) {
            if (n <= c && c < best) {
                best = c;
            }
        }
        if (best == uint.max) {
            throw new URLException("failed to find a new codepoint to process during punyencode");
        }
        // Widen before multiplying so the overflow check below sees the true value.
        delta += cast(ulong)(best - n) * (pushed + 1);
        if (delta > uint.max) {
            // TODO better error message
            throw new URLException("overflow during punyencode");
        }
        n = cast(dchar) best;
        foreach (dchar c; input) {
            if (c < n) {
                delta++;
            }
            if (c == n) {
                // Emit delta as a generalized variable-length integer.
                ulong q = delta;
                for (ulong k = base; ; k += base) {
                    immutable t = threshold(k, bias);
                    if (q < t) {
                        break;
                    }
                    output ~= digitToBasic(t + ((q - t) % (base - t)));
                    q = (q - t) / (base - t);
                }
                output ~= digitToBasic(q);
                pushed++;
                bias = adapt(delta, pushed, first);
                first = false;
                delta = 0;
            }
        }
        delta++;
        n++;
    }
    return output.data;
}

/**
 * Decode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-decode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "xn--m3h.xn--n3h.com";
 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
 * ---
 *
 * Input that does not start with "xn--" is returned unchanged.
 *
 * Throws:
 *   URLException if the encoded segment is truncated, contains a character that is not a
 *   Punycode digit, overflows, or decodes to an invalid code point.
 */
string punyDecode(string input) {
    if (!input.startsWith(marker)) {
        return input;
    }
    input = input[marker.length..$];

    // let n = initial_n
    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    ulong n = initialN;
    ulong i = 0;
    auto bias = initialBias;
    dchar[] output;
    // This reserves a bit more than necessary, but it should be more efficient overall than just
    // appending and inserting willy-nilly.
    output.reserve(input.length);

    // consume all code points before the last delimiter (if there is one)
    //  and copy them to output, fail on any non-basic code point
    // if more than zero code points were consumed then consume one more
    //  (which will be the last delimiter)
    auto end = input.lastIndexOf(delimiter);
    if (end > -1) {
        foreach (dchar c; input[0..end]) {
            if (c >= initialN) {
                throw new URLException("non-basic code point in punycode input");
            }
            output ~= c;
        }
        input = input[end+1 .. $];
    }

    // while the input is not exhausted do begin
    size_t pos = 0;
    while (pos < input.length) {
        // let oldi = i
        // let w = 1
        immutable oldi = i;
        ulong w = 1;
        // for k = base to infinity in steps of base do begin
        for (ulong k = base; ; k += base) {
            // consume a code point, or fail if there was none to consume
            // Note that the input is all ASCII, so we can simply index the input string bytewise.
            if (pos >= input.length) {
                throw new URLException("truncated punycode input");
            }
            // let digit = the code point's digit-value, fail if it has none
            immutable ulong digit = basicToDigit(input[pos]);
            pos++;
            if (digit >= base) {
                throw new URLException("invalid digit in punycode input");
            }
            // let i = i + digit * w, fail on overflow
            i += digit * w;
            if (i > uint.max) {
                throw new URLException("overflow during punydecode");
            }
            // let t = tmin if k <= bias {+ tmin}, or
            //     tmax if k >= bias + tmax, or k - bias otherwise
            immutable t = threshold(k, bias);
            // if digit < t then break
            if (digit < t) {
                break;
            }
            // let w = w * (base - t), fail on overflow
            w *= (base - t);
            if (w > uint.max) {
                throw new URLException("overflow during punydecode");
            }
            // end
        }
        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, output.length + 1, oldi == 0);
        // let n = n + i div (length(output) + 1), fail on overflow
        n += i / (output.length + 1);
        if (n > dchar.max || !isValidDchar(cast(dchar) n)) {
            throw new URLException("punycode input decodes to an invalid code point");
        }
        // let i = i mod (length(output) + 1)
        i %= (output.length + 1);
        // {if n is a basic code point then fail}
        // (We aren't actually going to fail here; it's clear what this means.)
        // insert n into output at position i
        (() @trusted { output.insertInPlace(cast(size_t) i, cast(dchar) n); })();  // should be @safe but isn't marked
        // increment i
        i++;
        // end
    }
    return output.to!string;
}

// Lifted from punycode.js.
// Maps a digit value 0..25 to 'a'..'z' and 26..35 to '0'..'9'.
private dchar digitToBasic(ulong digit) {
    return cast(dchar)(digit < 26 ? 'a' + digit : '0' + (digit - 26));
}

// Lifted from punycode.js.
// Inverse of digitToBasic (letters in either case); returns `base` for a non-digit character.
private uint basicToDigit(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0' + 26;
    }
    if (c >= 'A' && c <= 'Z') {
        return c - 'A';
    }
    if (c >= 'a' && c <= 'z') {
        return c - 'a';
    }
    return base;
}

unittest {
    {
        auto a = "b\u00FCcher";
        assert(punyEncode(a) == "xn--bcher-kva");
    }
    {
        auto a = "b\u00FCc\u00FCher";
        assert(punyEncode(a) == "xn--bcher-kvab");
    }
    {
        auto a = "ýbücher";
        auto b = punyEncode(a);
        assert(b == "xn--bcher-kvaf", b);
    }

    {
        auto a = "mañana";
        assert(punyEncode(a) == "xn--maana-pta");
    }

    {
        auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
            ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
        auto b = punyEncode(a);
        assert(b == "xn--egbpdaj6bu4bxfgehfvwxn", b);
    }
    import std.stdio;
}

unittest {
    {
        auto b = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
        assert(b == "ليهمابتكلموشعربي؟", b);
    }
    {
        assert(punyDecode("xn--maana-pta") == "mañana");
    }
}

unittest {
    import std.string, std.algorithm, std.array, std.range;
    {
        auto domain = "xn--m3h.xn--n3h.com";
        auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
        assert(decodedDomain == "☂.☃.com", decodedDomain);
    }
    {
        auto domain = "☂.☃.com";
        auto decodedDomain = domain.splitter(".").map!(punyEncode).join(".");
        assert(decodedDomain == "xn--m3h.xn--n3h.com", decodedDomain);
    }
}