/**
 * Some helper functions to work with strings
 */
module bc.string.string;

import bc.core.intrinsics;
import bc.core.memory : enforceMalloc, enforceRealloc, heapAlloc, heapDealloc;
import std.range : ElementEncodingType, hasLength, isInputRange;
import std.traits : ForeachType, isSomeChar, isSomeString, isStaticArray, Unqual;
// debug import core.stdc.stdio;

nothrow @nogc:

alias CString = const(char)[];

/**
 * Checks whether `S` can be used as a string source by the helpers in this module:
 * it must be an input range, a string or a static array, and its element encoding type
 * must be some character type.
 */
template isAcceptableString(S)
{
    enum isAcceptableString =
        (isInputRange!S || isSomeString!S || isStaticArray!S) &&
        isSomeChar!(ElementEncodingType!S);
}

/**
 * Temporary string buffer.
 * It can be used to build temporary \0 ended C strings.
 * For lengths up to 255 code units (plus the terminating \0) it uses the internal static array,
 * malloc-ated buffer otherwise.
 *
 * NOTE: be careful that pointer becomes invalid as soon as the struct comes out of scope!
 * NOTE: inspired by std.internal.cstring.TempCStringBuffer in Phobos library
 */
struct TempCString(C)
{
@trusted pure nothrow @nogc:

    @disable this();
    @disable this(this);
    alias ptr this;

    /// Pointer to the first code unit (internal array or heap buffer, `null` for a `null` source string)
    @property inout(C)* bufPtr() inout
    {
        return _ptr == useStack ? _buf.ptr : _ptr;
    }

    /// Read only pointer to the \0 terminated data
    @property const(C)* ptr() const { return bufPtr; }

    /// Stored data as a slice (without the terminating \0)
    const(C)[] opIndex() const pure { return bufPtr[0 .. _length]; }

    ~this()
    {
        // `_ptr` holds either the `useStack` sentinel, `null`, or a malloc-ated buffer
        if (_ptr != useStack)
        {
            import core.memory : pureFree;
            pureFree(_ptr);
        }
    }

private:
    C* _ptr;
    size_t _length;
    C[256] _buf;

    // sentinel pointer value meaning "data is stored in `_buf`"
    enum C* useStack = () @trusted { return cast(C*)size_t.max; }();

    // creates the instance without initializing its fields, caller has to set them all
    static TempCString initialize() { TempCString res = void; return res; }
}

/// ditto
auto tempCString(C = char, S)(scope S str) if (isAcceptableString!S)
{
    alias CF = Unqual!(ElementEncodingType!S);
    auto res = TempCString!C.initialize();

    static if (isSomeString!S)
    {
        if (str is null)
        {
            res._length = 0;
            res._ptr = null;
            return res;
        }
    }

    static if (C.sizeof == CF.sizeof && is(typeof(res._buf[0 .. str.length] = str[])))
    {
        // same code unit size and sliceable source: plain copy
        if (str.length < res._buf.length)
        {
            res._buf[0..str.length] = str[];
            res._buf[str.length] = 0;
            res._ptr = res.useStack;
        }
        else
        {
            res._ptr = () @trusted {
                auto p = cast(C*)enforceMalloc((str.length + 1) * C.sizeof);
                p[0 .. str.length] = str[];
                p[str.length] = 0;
                return cast(C*)p;
            }();
        }
        res._length = str.length;
        return res;
    }
    else
    {
        static assert(!(isSomeString!S && CF.sizeof == C.sizeof), "Should be using slice assignment.");
        C[] p = res._buf;
        size_t i;

        // used only as a size hint for the first heap allocation
        size_t strLength;
        static if (hasLength!S) strLength = str.length;

        import bc.internal.utf : byUTF;
        static if (isSomeString!S)
            auto r = cast(const(CF)[])str;  // because inout(CF) causes problems with byUTF
        else
            alias r = str;

        C[] heapBuffer;
        foreach (const c; r.byUTF!(Unqual!C))
        {
            // always keep one slot free for the terminating 0
            if (i + 1 == p.length)
            {
                heapBuffer = trustedRealloc(p, strLength, heapBuffer is null);
                p = heapBuffer;
            }
            p[i++] = c;
        }
        p[i] = 0;
        res._length = i;
        res._ptr = (heapBuffer is null ? res.useStack : &heapBuffer[0]);
        return res;
    }
}

///
@("tempCString")
nothrow @nogc @system unittest
{
    import core.stdc.string : strlen;

    string str = "abc";

    // Intended usage
    assert(strlen(str.tempCString()) == 3);

    // Correct usage
    auto tmp = str.tempCString();
    assert(strlen(tmp) == 3); // or `tmp.ptr`, or `tmp.bufPtr`

    // $(RED WARNING): $(RED Incorrect usage)
    auto pInvalid1 = str.tempCString().ptr;
    const char* pInvalid2 = str.tempCString();
}

@("tempCString - char, wchar, dchar")
nothrow @nogc @trusted unittest
{
    import std.algorithm : filter;
    import bc.internal.utf : byCodeUnit;

    {
        auto tmp = "baz".byCodeUnit.filter!(a => a == 'z').tempCString;
        assert(tmp._length == 1);
        assert(tmp._buf[0] == 'z');
        assert(tmp._buf[1] == '\0');
    }

    {
        auto tmp = "baz".byCodeUnit.filter!(a => a == 'z').tempCString!wchar;
        assert(tmp._length == 1);
        assert(tmp._buf[0] == 'z');
        assert(tmp._buf[1] == '\0');
    }

    {
        auto tmp = "baz".tempCString!dchar;
        assert(tmp._buf[0..3] == "baz"d);
    }
}

@("tempCString - static array")
nothrow @nogc @trusted unittest
{
    import core.stdc.string : strlen;

    immutable(char)[3] str = "abc";
    assert(strlen(str.tempCString()) == 3);
}

/**
 * Refcounted string implementation.
 *
 * It uses malloc for string buffer.
 *
 * Types with `RC` prefix are reference counted, so they can be moved around freely.
 * Types without `RC` prefix have disabled copy constructor and can be only moved (passing ownership) or cloned.
 *
 * There are variants with `W` and `D` before `String` that correspond to payloads `wchar` and `dchar` as usual.
 *
 * Types that end with `Z` internally manage trailing '\0' and so can be safely used with C interop.
*
 * NOTE: Beware of using exposed data pointer stored before some more content is added to RCString as internal buffer can be reallocated / resized if needed.
 */
alias RCString = StringImpl!(char, RC.yes, Zero.no);

/// ditto
alias RCWString = StringImpl!(wchar, RC.yes, Zero.no);

/// ditto
alias RCDString = StringImpl!(dchar, RC.yes, Zero.no);

/// ditto
alias RCStringZ = StringImpl!(char, RC.yes, Zero.yes);

/// ditto
alias RCWStringZ = StringImpl!(wchar, RC.yes, Zero.yes);

/// ditto
alias RCDStringZ = StringImpl!(dchar, RC.yes, Zero.yes);

/**
 * String with unique ownership implementation.
 *
 * Similar to RCString but can be only moved passing its ownership.
 * Furthermore it uses a stack allocated buffer of 512 code units for short strings.
 */
alias String = StringImpl!(char, RC.no, Zero.no);

/// ditto
alias WString = StringImpl!(wchar, RC.no, Zero.no);

/// ditto
alias DString = StringImpl!(dchar, RC.no, Zero.no);

/// ditto
alias StringZ = StringImpl!(char, RC.no, Zero.yes);

/// ditto
alias WStringZ = StringImpl!(wchar, RC.no, Zero.yes);

/// ditto
alias DStringZ = StringImpl!(dchar, RC.no, Zero.yes);

private enum RC { no, yes }
private enum Zero { no, yes }

private struct StringImpl(C, RC rc, Zero zero)
{
    @safe nothrow @nogc:

    // number of extra code units needed for the trailing '\0'
    static if (zero) enum Z = 1;
    else enum Z = 0;

    static if (rc)
    {
        private
        {
            struct Payload
            {
                size_t refs;
                size_t len;
                C[] buf;

                ~this() @trusted pure nothrow @nogc
                {
                    import core.memory : pureFree;
                    if (buf) pureFree(buf.ptr);
                }
            }

            Payload* pay;
        }

        /// Copy constructor
        this(ref return scope inout StringImpl rhs) pure @safe inout
        {
            pay = rhs.pay;
            if (pay) () @trusted { (cast(Payload*)pay).refs++; }();
        }

        /// Destructor
        ~this()
        {
            if (pay && --pay.refs == 0) heapDealloc(pay);
        }
    }
    else
    {
        private
        {
            enum STACK_LEN = 512;
            size_t len;
            C[STACK_LEN] stackBuf;
            C[] buf;
            bool useStackBuf;
            alias pay = typeof(this); // to access fields through pay.xx too
        }

        ~this() pure @trusted
        {
            import core.memory : pureFree;
            if (buf) pureFree(buf.ptr);
        }

        @disable this(this);

        // constructor used by move
        private this(C[] sbuf, C[] buf, size_t len)
        {
            this.stackBuf[0..sbuf.length] = sbuf[];
            this.buf = buf;
            this.len = len;
        }

        /// Passes ownership of the content to the returned instance, this one is left empty.
        StringImpl move() scope @trusted
        {
            import std.algorithm : min;
            auto obuf = buf;
            auto olen = len;
            buf = null;
            len = 0;
            // `+ Z` so the trailing '\0' of a stack resident string is carried over too,
            // the new instance's stackBuf is otherwise filled with C.init which is not 0
            return StringImpl(stackBuf[0..min(STACK_LEN, olen + Z)], obuf, olen);
        }

        ///
        StringImpl clone() scope @trusted
        {
            return StringImpl(this[]);
        }
    }

    /**
     * Constructor for cases when we know prior to the creation total length of the future string.
     * It preallocates internal buffer with `initialSize`.
     */
    this(size_t initialSize) pure
    {
        static if (rc) pay = heapAlloc!Payload(1, 0);
        immutable len = initialSize + Z;
        static if (!rc) {
            if (len <= STACK_LEN) return; // we can use stack buffer for that
        }
        pay.buf = () @trusted { return (cast(C*)enforceMalloc(len * C.sizeof))[0..len]; }();
    }

    /// Constructs the string with a copy of `str` content (see `put`)
    this(S)(auto ref scope S str)
    {
        put(str);
    }

    /**
     * Creates RCString from the provided arguments formatted to string with nogcFormatter
     */
    static StringImpl from(ARGS...)(auto ref ARGS args)
    {
        import bc.string.format : getFormatSize, nogcFormatTo;

        size_t total;
        // calculate total size needed so we don't have to reallocate
        static foreach (a; args) total += getFormatSize(a);

        // and format arguments to RCString
        auto ret = StringImpl(total);
        static foreach (a; args) ret.nogcFormatTo(a);
        return ret;
    }

    alias data this;

    /**
     * Access internal string including the reserved block if any.
     */
    @property inout(C)[] data() pure inout
    {
        if (!length) return null;

        static if (!rc) {
            if (len + Z <= STACK_LEN) return stackBuf[0..len];
        }

        assert(pay.buf);
        return pay.buf[0..pay.len];
    }

    static if (zero)
    {
        /// Pointer to string data that can be directly used in a C functions expecting '\0' terminal char.
        /// Returns `null` for an empty string.
        @property inout(C*) ptr() pure inout @trusted
        {
            if (!length) return null;
            static if (!rc) {
                if (len + Z <= STACK_LEN) return stackBuf.ptr;
            }
            return pay.buf.ptr;
        }
    }

    /// Slicing support for the internal buffer data
    @property inout(C)[] opSlice() pure inout
    {
        return this.data;
    }

    /// ditto
    @property inout(C)[] opSlice(size_t start, size_t end) pure inout
    {
        if (start > length || end > length) assert(0, "Index out of bounds");
        if (start > end) assert(0, "Invalid slice indexes");
        return this.data[start .. end];
    }

    /// Indexed access to the buffer data
    @property ref C opIndex(size_t idx) pure return
    {
        if (idx >= length) assert(0, "Index out of bounds");
        return this.data[idx];
    }

    /// opDollar implementation
    alias length opDollar;

    /// Managed string length
    @property size_t length() pure const
    {
        static if (rc)
            return pay ? pay.len : 0;
        else
            return len;
    }

    /// Returns: capacity that can be used without reallocation
    size_t capacity() pure const
    {
        static if (rc)
            return pay ? (pay.buf.length - pay.len - Z) : 0;
        else
            return (buf ? buf.length : STACK_LEN) - pay.len - Z;
    }

    /**
     * Reserves space for requested number of characters that also increments string length.
     * This can be used for example in cases when we need to fill slice of string with some known length data.
     * To return reserved data, use `dropBack`.
     */
    void reserve(size_t sz)
    {
        ensureAvail(sz);
        pay.len += sz;
        static if (zero) terminate(); // keep the '\0' right behind the (grown) content
    }

    /**
     * Drops defined amount of characters from the back.
     */
    void dropBack(size_t sz)
    {
        assert(length >= sz, "Not enough data");
        if (!sz) return;

        static if (!rc)
        {
            if (len + Z > STACK_LEN && len + Z - sz <= STACK_LEN)
            {
                // switch from heap buffer back to stack one
                stackBuf[0..len - sz] = buf[0..len - sz];
            }
        }
        pay.len -= sz;
        // terminate() picks the buffer that holds the data now (the heap one may not even exist)
        static if (zero) terminate();
    }

    /**
     * Clears content of the data, but keeps internal buffer as is so it can be used to build another string.
     */
    void clear() pure
    {
        static if (rc) {
            if (pay) pay.len = 0;
        }
        else len = 0;
    }

    alias opOpAssign(string op : "~") = put;

    /// Replaces current content with `str`
    void opAssign(S)(auto ref scope S str)
        if (isAcceptableString!S || is(Unqual!S == C))
    {
        clear();
        put(str);
    }

    /// Appends single code unit
    void put(in C val) pure
    {
        static if (!rc)
        {
            if (len + 1 + Z <= STACK_LEN)
            {
                stackBuf[len++] = val;
                static if (zero) stackBuf[len] = 0;
                return;
            }
        }
        ensureAvail(1);
        pay.buf[pay.len++] = val;
        static if (zero) pay.buf[pay.len] = 0;
    }

    /// Appends string, static array or character range (transcoded to `C` when needed)
    void put(S)(auto ref scope S str) if (isAcceptableString!S)
    {
        alias CF = Unqual!(ElementEncodingType!S);

        static if (C.sizeof == CF.sizeof && is(typeof(pay.buf[0 .. str.length] = str[])))
        {
            // same code unit size and sliceable source: plain copy
            static if (!rc)
            {
                if (len + str.length + Z <= STACK_LEN)
                {
                    stackBuf[len .. len + str.length] = str[];
                    len += str.length;
                    static if (zero) stackBuf[len] = 0;
                    return;
                }
            }

            ensureAvail(str.length);
            pay.buf[pay.len .. pay.len + str.length] = str[];
            pay.len += str.length;
            static if (zero) pay.buf[pay.len] = 0;
        }
        else
        {
            // copy range one code unit at a time
            import bc.internal.utf : byUTF;
            static if (isSomeString!S)
                auto r = cast(const(CF)[])str;  // because inout(CF) causes problems with byUTF
            else
                alias r = str;

            // Source length is used just as a preallocation hint, the number of resulting
            // code units can differ when transcoding.
            static if (hasLength!S) ensureAvail(str.length);

            foreach (ch; r.byUTF!(Unqual!C))
            {
                static if (!rc)
                {
                    // same rule as in `put(C)`: content lives on the stack while it fits there
                    if (len + 1 + Z <= STACK_LEN)
                    {
                        stackBuf[len++] = ch;
                        continue;
                    }
                }
                // allocates / grows the heap buffer and moves stack content to it when needed
                ensureAvail(1);
                pay.buf[pay.len++] = ch;
            }
            static if (zero) terminate();
        }
    }

    // Makes sure that `sz` more code units (plus the trailing '\0' for Z variants) can be stored.
    private void ensureAvail(size_t sz) pure
    {
        static if (__VERSION__ >= 2094) pragma(inline, true);
        else pragma(inline);
        import core.bitop : bsr;
        import std.algorithm : max, min;

        static if (rc)
        {
            if (!pay)
            {
                // allocate new payload with required size
                pay = heapAlloc!Payload(1, 0);
                immutable l = max(sz+Z, 64); // allocates at least 64 code units
                pay.buf = () @trusted { return (cast(C*)enforceMalloc(l * C.sizeof))[0..l]; }();
                return;
            }

            if (pay.len + sz + Z <= pay.buf.length) return; // we can fit in what we've already allocated
        }
        else
        {
            if (len + sz + Z <= STACK_LEN) return; // still fits to stack buffer
            if (buf is null)
            {
                immutable l = max(len + sz + Z, STACK_LEN + 64); // allocates at least 64 code units over
                buf = () @trusted { return (cast(C*)enforceMalloc(l * C.sizeof))[0..l]; }();
                buf[0..len] = stackBuf[0..len]; // copy data from stack buffer, we'll use heap allocated one from now
                return;
            }
            if (len + Z <= STACK_LEN)
            {
                // some buffer is already preallocated, but we're still on stackBuffer and need to move to heap allocated one
                assert(buf.length > STACK_LEN);
                buf[0..len] = stackBuf[0..len]; // copy current data from the stack
            }

            if (len + sz + Z <= buf.length) return; // we can fit in what we've already allocated
        }

        // reallocate buffer
        // Note: new length calculation taken from std.array.appenderNewCapacity
        immutable ulong mult = 100 + (1000UL) / (bsr((pay.len + sz + Z)) + 1);
        immutable l = cast(size_t)(((pay.len + sz + Z) * min(mult, 200) + 99) / 100);
        // debug printf("realloc %lu -> %lu\n", pay.len, l);
        pay.buf = () @trusted { return (cast(C*)enforceRealloc(pay.buf.ptr, l * C.sizeof))[0..l]; }();
    }

    static if (zero)
    {
        // Stores the trailing '\0' right behind the current content, in the buffer that holds it.
        private void terminate() pure
        {
            static if (rc)
            {
                if (!pay) return; // nothing allocated yet, `ptr` returns null for such string
            }
            else
            {
                if (len + Z <= STACK_LEN)
                {
                    stackBuf[len] = 0;
                    return;
                }
            }
            pay.buf[pay.len] = 0;
        }
    }
}

/// Creates `RCString` (or its `C` code unit variant) with a copy of `str` content
auto rcString(C = char, S)(auto ref S str)
{
    StringImpl!(C, RC.yes, Zero.no) ret;
    ret.put(str);
    return ret;
}

@("RCString")
@system @nogc unittest
{
    import bc.internal.utf : byCodeUnit;
    import std.algorithm : filter;

    RCStringZ s;
    s ~= "fo";
    assert(s.pay.len == 2);
    assert(s.pay.buf.length >= 3);

    s ~= 'o';
    assert(s.pay.len == 3);
    assert(s.pay.buf.length >= 4);

    s ~= "bar";
    assert(s.pay.len == 6);
    assert(s.pay.buf.length >= 7);
    assert(s == "foobar");

    s ~= "baz".byCodeUnit.filter!(a => a == 'z');
    assert(s.length == "foobarz".length);
    assert(s.data == "foobarz");
    assert(s == "foobarz");
    assert(s.ptr == &s.data[0]);
    assert((s.ptr["foobarz".length]) == 0);
}

@("RCString.from")
@nogc @safe unittest
{
    {
        auto str = RCString.from("foo", 42, "bar");
        assert(str == "foo42bar");
    }

    {
        auto str = RCWString.from("foo");
        assert(str == "foo"w);
    }
}

version (D_Exceptions)
{
    @("RCString with Nullable")
    @nogc @safe unittest
    {
        import std.typecons : Nullable;
        Nullable!RCString sn = RCString("foo");
    }
}

@("rcString")
@nogc @safe unittest
{
    auto str = "foo".rcString();
    assert(str == "foo");
}

@("String")
@nogc @safe unittest
{
    auto s = String("Hello");
    assert(s.capacity == String.stackBuf.length - 5);
    assert(s[] == "Hello");
    s ~= " String";
    assert(s[] == "Hello String");
    auto s2 = s.clone();
    assert(s[] == s2[]);
    () @trusted { assert(s.ptr != s2.ptr); }();

    auto s3 = s.move();
    assert(s.buf is null);
    assert(s.len == 0);
    assert(s3 == "Hello String");
}

@("String - put static array")
@nogc @safe unittest
{
    String s;
    immutable(char)[3] foo = "foo";
    s ~= foo;
    assert(s == "foo");
}

@("String stack to heap")
@nogc @safe unittest
{
    import std.algorithm : each;
    import std.range : repeat;

    StringZ s;
    'a'.repeat(s.stackBuf.length-1).each!(c => s.put(c));
    assert(s.length == s.stackBuf.length-1);
    assert(s.stackBuf[$-2] == 'a');
    assert(s.stackBuf[$-1] == '\0');
    assert(s.buf is null);
    assert(&s.data[0] == &s.stackBuf[0]);
    s ~= 'b';
    assert(s.stackBuf[$-1] == '\0'); // doesn't change on stack to heap switch
    assert(s.buf !is null);
    assert(&s.data[0] == &s.buf[0]);
    assert(s.buf[s.stackBuf.length-1] == 'b');
    s ~= "foo";

    s.clear();
    s ~= 'c';
    assert(&s.data[0] == &s.stackBuf[0]); // back to stack usage
    assert(s.buf !is null); // but heap buffer is still there
    'd'.repeat(s.stackBuf.length).each!(c => s.put(c));
    assert(&s.data[0] == &s.buf[0]);
    assert(s.length == 1 + s.stackBuf.length);
    assert(s.buf[1 + s.stackBuf.length] == '\0');
}

@("String reserve")
@nogc @safe unittest
{
    String buf;
    assert(buf.length == 0);
    assert(buf.capacity == buf.stackBuf.length);
    buf.reserve(64);
    assert(buf.length == 64);
    assert(buf.buf is null);
    buf[][0..3] = "foo";
    buf.dropBack(61);
    assert(buf[] == "foo");
    buf.reserve(buf.stackBuf.length);
    assert(buf.buf !is null);
    assert(buf.buf[0..3] == "foo");
    buf.buf[0..3] = "bar";
    buf.dropBack(buf.stackBuf.length);
    assert(buf.buf !is null); // left allocated for reuse
    assert(buf.stackBuf[0..3] == "bar"); // copy from heap
}

@("StringZ - trailing zero on stack buffer")
@nogc @safe unittest
{
    StringZ s;
    s ~= "abc";
    s.dropBack(1); // no heap buffer exists here
    assert(s == "ab");
    assert(s.buf is null);
    assert(s.stackBuf[2] == '\0');

    auto m = s.move();
    assert(m == "ab");
    assert(m.stackBuf[2] == '\0');

    StringZ r;
    r.reserve(3);
    assert(r.length == 3);
    assert(r.stackBuf[3] == '\0');
}

@("StringZ - put range")
@system @nogc unittest
{
    import bc.internal.utf : byCodeUnit;
    import std.algorithm : filter;

    StringZ s;
    s ~= "baz".byCodeUnit.filter!(a => a == 'z');
    assert(s == "z");
    assert(s.buf is null);
    assert(s.stackBuf[1] == '\0');
}

// Grows temporary buffer used by `tempCString`.
// When `bufIsOnStack` is set, new heap buffer is allocated and `buf` content is copied to it,
// otherwise `buf` is reallocated. `strLength` is a hint of the source length (0 if unknown).
private C[] trustedRealloc(C)(scope C[] buf, size_t strLength, bool bufIsOnStack)
    @trusted @nogc pure nothrow
{
    pragma(inline, false);  // because it's rarely called

    import bc.core.memory : enforceMalloc, enforceRealloc;

    size_t newlen = buf.length * 3 / 2;

    if (bufIsOnStack)
    {
        if (newlen <= strLength)
            newlen = strLength + 1; // +1 for terminating 0
        auto ptr = cast(C*) enforceMalloc(newlen * C.sizeof);
        ptr[0 .. buf.length] = buf[];
        return ptr[0 .. newlen];
    }
    else
    {
        if (buf.length >= size_t.max / (2 * C.sizeof))
        {
            version (D_Exceptions)
            {
                import core.exception : onOutOfMemoryError;
                onOutOfMemoryError();
            }
            else assert(0, "Memory allocation failed");
        }
        auto ptr = cast(C*) enforceRealloc(buf.ptr, newlen * C.sizeof);
        return ptr[0 .. newlen];
    }
}

/// Strips leading whitespace ('\t', '\n', '\r', ' ')
S stripLeft(S)(S str)
{
    pragma(inline, true);
    /// All chars except for whitespace ('\t', '\n', '\r', ' ')
    enum AllExceptWhitespaceRanges = "\0\10\13\14\16\37\41\377";

    size_t rpos;
    immutable rs = parseToken!(AllExceptWhitespaceRanges, '"')(str, rpos);
    if(rs == -1) return str[$..$]; // Only whitespace string, return empty range.
    else return str[rpos..$];
}

@("stripLeft")
unittest
{
    assert(stripLeft("\t\n\r foobar\t\n\r ") == "foobar\t\n\r ");
    assert(stripLeft("\t\n\r\t\n\r ") == "");
}

/**
 * Checks if the first character of `str` is within the characters defined by `chars`.
 *
 * Note that `chars` is passed to `buildValidCharMap` and so it is interpreted as pairs of inclusive
 * ranges, not as a plain set: i.e. `['+', '-']` matches '+', ',' and '-'.
 *
 * Returns: `false` for an empty `str` or when the first character is outside of the 256 entries map.
 */
bool startsWith(S, char[] chars)(S str)
{
    static immutable bool[256] validCharMap = buildValidCharMap(chars, true);
    if (!str.length) return false; // nothing to check, don't index an empty input
    immutable c = str[0];
    return c < validCharMap.length && validCharMap[c];
}

@("startsWith")
unittest
{
    assert(startsWith!(string, ['+', '-'])("-42"));
    assert(startsWith!(string, ['+', '-'])("+42"));
    assert(!startsWith!(string, ['+', '-'])("42"));
    assert(!startsWith!(string, ['+', '-'])(""));
}

/**
 * Alternative implementation of `std.string.outdent` that differs in:
 *
 *   * meant for dedent string literals in CT
 *   * if first line is not indented, other lines are dedented still (std.string.outdent returns original text in that case)
 *   * empty lines at the text start are removed
 */
template dedent(alias str)
{
    // returns the first line of `str` including its line ending ("\n" or "\r\n")
    static S getLine(S)(S str)
    {
        if (!str.length) return null;
        for (size_t i = 0; i < str.length; ++i)
        {
            if (str[i] == '\r')
            {
                if (i+1 < str.length && str[i+1] == '\n')
                    return str[0..i+2];
            }
            if (str[i] == '\n') return str[0..i+1];
        }
        return str;
    }

    // strip line whitespace but keep newline characters
    static S stripWS(S)(S str)
    {
        if (!str.length) return null;
        for (size_t i = 0; i < str.length; ++i)
        {
            if (str[i] <= ' ' && str[i] != '\r' && str[i] != '\n') continue;
            return str[i..$];
        }
        return null;
    }

    // evaluates the shortest indentation over all lines of `str` (`prev` is the shortest one found so far)
    template shortestIndent(alias str, size_t prev = size_t.max)
    {
        enum line = getLine(str);
        enum stripped = stripWS(line);
        static if (line.length == 0) enum shortestIndent = prev;
        else static if (line.length == stripped.length) enum shortestIndent = 0;
        else
        {
            enum cur = prev > line.length - stripped.length ? line.length - stripped.length : prev;
            enum next = shortestIndent!(str[line.length..$], cur);
            enum shortestIndent = cur > next ? next : cur;
        }
    }

    // removes `indent` leading characters from each line of `str` (shorter lines are kept as they are)
    template dedentNext(alias str, size_t indent)
    {
        enum ln = getLine(str);
        static if (!ln.length)
            enum dedentNext = null;
        else static if (ln.length < indent)
            enum dedentNext = ln ~ dedentNext!(str[ln.length..$], indent);
        else
            enum dedentNext = ln[indent..$] ~ dedentNext!(str[ln.length..$], indent);
    }

    enum line = getLine(str);
    enum stripped = stripWS(line);

    static if (!line.length) enum dedent = null;
    else static if (
            (stripped.length == 1 && stripped[0] == '\n')
            || (stripped.length == 2 && stripped[0] == '\r' && stripped[1] == '\n'))
        enum dedent = dedent!(str[line.length..$]); // drop first empty lines
    else
    {
        // ignore no indentation of the first line
        enum shortest = shortestIndent!(
            str[line.length..$],
            stripped.length == line.length ? size_t.max : (line.length - stripped.length));

        static if (shortest == 0)
            enum dedent = str; // no indent used
        else
            enum dedent = stripped ~ dedentNext!(str[line.length..$], shortest);
    }
}

@("dedent")
unittest
{
    // with empty first line
    {
        enum str1 = `
                DELETE FROM elements.element
                WHERE id=ANY($1) AND type_id IN (
                    SELECT id FROM elements.element_type WHERE owner=$2
                )`;

        enum str2 =
                    "DELETE FROM elements.element\n" ~
                    "WHERE id=ANY($1) AND type_id IN (\n" ~
                    "    SELECT id FROM elements.element_type WHERE owner=$2\n" ~
                    ")";

        static assert(dedent!str1 == str2);
    }

    // with not indented first line
    {
        enum str1 = `DELETE FROM elements.element
                WHERE id=ANY($1) AND type_id IN (
                    SELECT id FROM elements.element_type WHERE owner=$2
                )`;

        enum str2 = "DELETE FROM elements.element\n" ~
                    "WHERE id=ANY($1) AND type_id IN (\n" ~
                    "    SELECT id FROM elements.element_type WHERE owner=$2\n" ~
                    ")";

        static assert(dedent!str1 == str2);
    }

    // test that we didn't touch number of lines
    {
        static assert(dedent!`
            2
            3
            ` == "2\n3\n"); // first line is dropped, last newline is kept
    }

    // test we don't dedent when some line is not indented
    {
        enum str = `aa
            bb
cc`;
        assert(dedent!str == str);
    }

    // test that we don't touch space after last line text
    {
        assert(dedent!" foo " == "foo ");
        assert(dedent!`foo
            bar ` == "foo\nbar ");
    }
}

/**
 * Builds char map from the provided ranges.
 *
 * Params:
 *   ranges = ranges of ascii characters (pairs of inclusive lower and upper bound).
 *   valid = whether range characters are valid or not.
 *           For example:
 *           buildValidCharMap("\0/:\xff", false) means that only characters 0-9 would have true in the generated map.
 *           buildValidCharMap("\0/:\xff", true) means that all characters except 0-9 would have true in the generated map.
 *
 * Returns: generated table
 */
bool[256] buildValidCharMap(S)(S ranges, bool valid = false)
{
    assert(ranges.length % 2 == 0, "Uneven ranges");
    bool[256] res = valid ? false : true;

    for (int i=0; i < ranges.length; i+=2)
        for (int j=ranges[i]; j <= ranges[i+1]; ++j)
            res[j] = valid ? true : false;
    return res;
}

///
@("buildValidCharMap")
@safe unittest
{
    string ranges = "\0 \"\"(),,//:@[]{{}}\x7f\xff";
    assert(buildValidCharMap(ranges, false) ==
        cast(bool[])[
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
            0,1,0,1,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
            0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        ]);

    assert(buildValidCharMap(ranges, true) ==
        cast(bool[])[
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,
            1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        ]);
}

/*
 * Advances index over the token to the next character while checking for valid characters.
 * On success, buffer index is left on the next character.
 *
 * Params:
 *   - ranges = ranges of characters to stop on
 *   - next = next character/s to stop on (must be present in the provided ranges too)
 *   - sseRanges =
 *         as SSE optimized path is limited to 8 pairs, here one can provide merged ranges for a fast
 *         SSE path that would be precised with `ranges`. Otherwise `ranges` is used for SSE path too.
*
 * Returns:
 *   * 0 on success
 *   * -1 when token hasn't been found (ie not enough data in the buffer)
 *   * -2 when character from invalid ranges was found but not matching one of next characters (ie invalid token)
 */
int parseToken(string ranges, alias next, string sseRanges = null, C)(const(C)[] buffer, ref size_t i) pure
    if (is(C == ubyte) || is(C == char))
{
    version (DigitalMars) {
        static if (__VERSION__ >= 2094) pragma(inline, true); // older compilers can't inline this
    } else pragma(inline, true);

    // lookup table with `true` for characters outside of `ranges` (the ones that can be skipped over)
    immutable charMap = parseTokenCharMap!(ranges)();

    static if (LDC_with_SSE42)
    {
        // CT function to prepare input for SIMD vector enum
        static byte[16] padRanges()(string ranges)
        {
            byte[16] res;
            // res[0..ranges.length] = cast(byte[])ranges[]; - broken on macOS betterC tests
            foreach (i, c; ranges) res[i] = cast(byte)c;
            return res;
        }

        static if (sseRanges) alias usedRng = sseRanges;
        else alias usedRng = ranges;
        static assert(usedRng.length <= 16, "Ranges must be at most 16 characters long");
        static assert(usedRng.length % 2 == 0, "Ranges must have even number of characters");
        enum rangesSize = usedRng.length;
        enum byte16 rngE = padRanges(usedRng);

        if (_expect(buffer.length - i >= 16, true))
        {
            size_t left = (buffer.length - i) & ~15; // round down to multiple of 16
            byte16 ranges16 = rngE;

            // process the buffer in 16 byte blocks
            do
            {
                byte16 b16 = () @trusted { return cast(byte16)_mm_loadu_si128(cast(__m128i*)&buffer[i]); }();
                immutable r = _mm_cmpestri(
                    ranges16, rangesSize,
                    b16, 16,
                    _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS
                );

                // r is used as an offset of the match within the block, 16 is handled as "nothing found"
                if (r != 16)
                {
                    i += r;
                    goto FOUND;
                }
                i += 16;
                left -= 16;
            }
            while (_expect(left != 0, true));
        }
    }
    else
    {
        // faster unrolled loop to iterate over 8 characters
        loop: while (_expect(buffer.length - i >= 8, true))
        {
            static foreach (_; 0..8)
            {
                // stop on the first character that is within `ranges`
                if (_expect(!charMap[buffer[i]], false)) goto FOUND;
                ++i;
            }
        }
    }

    // handle the rest
    if (_expect(i >= buffer.length, false)) return -1;

    // Checks characters one by one: used for the remaining tail of the buffer and to classify
    // the character the fast paths above stopped on.
    FOUND:
    while (true)
    {
        static if (is(typeof(next) == char)) {
            static assert(!charMap[next], "Next character is not in ranges");
            if (buffer[i] == next) return 0;
        } else {
            static assert(next.length > 0, "Next character not provided");
            static foreach (c; next) {
                static assert(!charMap[c], "Next character is not in ranges");
                if (buffer[i] == c) return 0;
            }
        }
        // character from `ranges` that is not one of the expected `next` characters
        if (_expect(!charMap[buffer[i]], false)) return -2;
        // end of the buffer reached without finding `next`
        if (_expect(++i == buffer.length, false)) return -1;
    }
}

///
@("parseToken")
@safe unittest
{
    size_t idx;
    string buf = "foo\nbar";
    auto ret = parseToken!("\0\037\177\377", "\r\n")(buf, idx);
    assert(ret == 0); // no error
    assert(idx == 3); // index of newline character

    idx = 0;
    ret = parseToken!("\0\037\177\377", "\r\n")(buf[0..3], idx);
    assert(ret == -1); // not enough data to find next character
    assert(idx == 3);

    idx = 0;
    buf = "foo\t\nbar";
    ret = parseToken!("\0\037\177\377", "\r\n")(buf, idx);
    assert(ret == -2); // invalid character '\t' found in token
    assert(idx == 3); // invalid character on index 3
}

// Returns lookup table built by `buildValidCharMap(invalidRanges)`: entries are `false` for characters
// within `invalidRanges` and `true` for the others. The table is stored as a `static immutable`
// per `invalidRanges` instantiation.
private immutable(bool[256]) parseTokenCharMap(string invalidRanges)() {
    static immutable charMap = buildValidCharMap(invalidRanges);
    return charMap;
}