vibe-core/source/vibe/internal/string.d
Sönke Ludwig 2e4bc6a316 Improve performance of sanitizeUTF8/readFileUTF8.
Uses std.encoding.sanitize, which returns the original string, if all code points are properly encoded. Note that the performance could still be improved considerably by iterating over multiple bytes at once, fast skipping over characters that don't have the most significant bit set.
2019-10-22 11:40:23 +02:00

232 lines
5.5 KiB
D

/**
Utility functions for string processing
Copyright: © 2012-2014 RejectedSoftware e.K.
License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
Authors: Sönke Ludwig
*/
module vibe.internal.string;
public import std.string;
import vibe.internal.array;
import vibe.internal.allocator;
import std.algorithm;
import std.array;
import std.ascii;
import std.format;
import std.uni;
import std.utf;
import core.exception;
/**
    Turns a byte sequence that may contain malformed UTF-8 into a valid UTF-8
    string that stays as close to the input as possible.

    The bytes are reinterpreted as characters and handed to
    `std.encoding.sanitize`, which returns the original data unchanged when
    everything is already properly encoded.

    Params:
        str = raw, immutable input bytes

    Returns:
        A valid UTF-8 string.
*/
string sanitizeUTF8(immutable(ubyte)[] str)
@safe pure {
    import std.encoding : sanitize;

    // Same memory, viewed as characters instead of bytes.
    immutable(char)[] text = cast(immutable(char)[])str;

    // sanitize() is not @safe itself, so the call is wrapped in a trusted lambda.
    return (() @trusted => sanitize(text))();
}
/**
    Turns a byte sequence that may contain malformed UTF-8 into a valid UTF-8
    string that stays as close to the input as possible.

    This overload accepts non-immutable input. Because the caller may still
    modify the buffer afterwards, the result is always backed by fresh memory:
    if `std.encoding.sanitize` hands back the input unchanged, a copy is made.

    Params:
        str = raw input bytes

    Returns:
        A valid UTF-8 string that does not alias `str`.
*/
string sanitizeUTF8(in ubyte[] str)
@trusted pure {
    import std.encoding : sanitize;

    // Temporarily treat the caller's buffer as immutable characters; this
    // view must not escape the function.
    immutable(char)[] view = cast(immutable(char)[])str;
    auto cleaned = sanitize(view);

    // Same pointer means sanitize() returned its argument, so duplicate it.
    return cleaned.ptr is view.ptr ? view.idup : cleaned;
}
/**
    Removes a leading UTF-8 byte order mark (the bytes EF BB BF), if present.

    Handy for text that was read from a file.

    Params:
        str = the string to inspect

    Returns:
        `str` without its first three bytes when they form the BOM,
        otherwise `str` unchanged.
*/
string stripUTF8Bom(string str)
@safe pure nothrow {
    immutable hasBom = str.length >= 3
        && str[0] == 0xEF
        && str[1] == 0xBB
        && str[2] == 0xBF;
    return hasBom ? str[3 .. $] : str;
}
/**
    Tests whether every code point of `str` also occurs in `chars`.

    Params:
        str = the string to check (decoded code point by code point)
        chars = the set of permitted characters

    Returns:
        `true` if no code point outside of `chars` was found; this includes
        the case of an empty `str`.
*/
bool allOf(string str, string chars)
@safe pure {
    foreach (dchar codePoint; str) {
        immutable permitted = chars.canFind(codePoint);
        if (!permitted) return false;
    }
    return true;
}
/**
    Variant of `std.string.indexOf` for a single character that has its own
    search loop for compile-time evaluation.

    Params:
        s = the string to search
        c = the character to look for
        cs = whether the comparison is case sensitive

    Returns:
        The code unit index of the first match, or -1 if `c` does not occur.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, dchar c, CaseSensitive cs = CaseSensitive.yes)
@safe pure {
    // At run time the Phobos implementation is used directly.
    if (!__ctfe) return std.string.indexOf(s, c, cs);

    if (cs == CaseSensitive.yes) {
        foreach (pos, dchar current; s)
            if (current == c) return pos;
        return -1;
    }

    // Case-insensitive: compare the lower-case forms.
    immutable dchar wanted = std.uni.toLower(c);
    foreach (pos, dchar current; s)
        if (std.uni.toLower(current) == wanted) return pos;
    return -1;
}
/**
    Variant of `std.string.indexOf` for a substring that has its own search
    loop for compile-time evaluation.

    Params:
        s = the string to search
        needle = the substring to look for

    Returns:
        The code unit index at which `needle` first occurs in `s`, or -1 if
        it does not occur.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, in Char[] needle)
{
    if (__ctfe) {
        if (s.length < needle.length) return -1;
        // The last possible start position is s.length - needle.length
        // (inclusive), hence the "+ 1" on the exclusive upper bound. The
        // length check above guarantees that the subtraction cannot wrap.
        foreach (i; 0 .. s.length - needle.length + 1)
            if (s[i .. i+needle.length] == needle)
                return i;
        return -1;
    } else return std.string.indexOf(s, needle);
}
/**
    Tests whether at least one character of `str` occurs in `chars`.

    Note that `str` is walked one UTF-8 code unit at a time, without
    decoding multi-byte sequences.

    Params:
        str = the string to check
        chars = the set of characters to look for

    Returns:
        `true` as soon as a character of `str` is found in `chars`,
        `false` if there is none (including for an empty `str`).
*/
bool anyOf(string str, string chars)
@safe pure {
    foreach (unit; str) {
        immutable found = chars.canFind(unit);
        if (found) return true;
    }
    return false;
}
/**
    Removes leading ASCII blanks (space and tab characters) from `s`.

    Returns:
        The slice of `s` that starts at the first character that is neither
        a space nor a tab.
*/
string stripLeftA(string s)
@safe pure nothrow {
    size_t start = 0;
    while (start < s.length && (s[start] == ' ' || s[start] == '\t'))
        start++;
    return s[start .. $];
}
/**
    Removes trailing ASCII blanks (space and tab characters) from `s`.

    Returns:
        The slice of `s` that ends after the last character that is neither
        a space nor a tab.
*/
string stripRightA(string s)
@safe pure nothrow {
    size_t end = s.length;
    while (end > 0 && (s[end - 1] == ' ' || s[end - 1] == '\t'))
        end--;
    return s[0 .. end];
}
/**
    Removes ASCII blanks (space and tab characters) from both ends of `s`.

    The right end is trimmed first via `stripRightA`, then the left end via
    `stripLeftA`.
*/
string stripA(string s)
@safe pure nothrow {
    immutable rightTrimmed = stripRightA(s);
    return stripLeftA(rightTrimmed);
}
/**
    Finds the first occurrence of any of the characters in `chars`.

    `str` is scanned one UTF-8 code unit at a time.

    Params:
        str = the string to search
        chars = the set of characters to look for

    Returns:
        The index into `str` of the first match, or -1 if none of the
        characters occurs.
*/
ptrdiff_t indexOfAny(string str, string chars)
@safe pure {
    for (size_t pos = 0; pos < str.length; pos++) {
        immutable char unit = str[pos];
        if (chars.canFind(unit))
            return pos;
    }
    return -1;
}

/// Alternative name for `indexOfAny`.
alias countUntilAny = indexOfAny;
/**
    Locates the bracket that closes the bracket at the start of `str`
    (supports '[', '$(LPAREN)', '<' and '{').

    Params:
        str = input string; its first character must be an opening bracket
        nested = if `true`, further opening brackets of the same kind raise
            the nesting level and need their own closing bracket; if `false`,
            the first closing bracket ends the search

    Returns:
        The index of the matching closing bracket, or -1 if `str` is
        unbalanced, shorter than two characters, or does not start with one
        of the supported brackets.
*/
ptrdiff_t matchBracket(string str, bool nested = true)
@safe pure nothrow {
    if (str.length < 2) return -1;

    immutable char opening = str[0];
    char closing;
    switch (opening) {
        case '[': closing = ']'; break;
        case '(': closing = ')'; break;
        case '<': closing = '>'; break;
        case '{': closing = '}'; break;
        default: return -1;
    }

    // The bracket at index 0 is already open.
    size_t depth = 1;
    for (size_t pos = 1; pos < str.length; pos++) {
        immutable char c = str[pos];
        if (nested && c == opening) depth++;
        else if (c == closing) depth--;
        if (depth == 0) return pos;
    }
    return -1;
}
@safe unittest
{
    // An input string together with the result expected from matchBracket.
    static struct Case { string input; ptrdiff_t expected; }

    enum cases = [
        // simple, balanced inputs
        Case("[foo]", 4), Case("<bar>", 4), Case("{baz}", 4),
        // too short, unbalanced, or not starting with a bracket
        Case("[", -1), Case("[foo", -1), Case("ab[f]", -1),
        // nesting; brackets of a different kind are ignored
        Case("[foo[bar]]", 9), Case("[foo{bar]]", 8),
    ];

    foreach (c; cases)
        assert(matchBracket(c.input) == c.expected);

    // With nesting disabled, the first closing bracket wins.
    assert(matchBracket("[foo[bar]]", false) == 8);

    // The function has to be usable at compile time as well.
    static assert(matchBracket("[foo]") == 4);
}
/**
Same as std.string.format, just using an allocator.

Params:
alloc = allocator handed to the `AllocAppender` that collects the output
fmt = format string, interpreted by `formattedWrite`
args = values to be formatted according to `fmt`

Returns:
The formatted string, taken from the appender's `data` property.
NOTE(review): presumably the returned memory belongs to `alloc` and not to
the GC - confirm against `AllocAppender` before storing the result long-term.
*/
string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)
{
// Output sink that is constructed from the supplied allocator.
auto app = AllocAppender!string(alloc);
// The appender is passed by pointer, so formattedWrite writes into `app` itself.
formattedWrite(&app, fmt, args);
return app.data;
}
/**
    Special version of icmp() with optimization for ASCII characters.

    Compares two strings case-insensitively. A common prefix of identical
    bytes is skipped first; after that, pairs of ASCII characters are compared
    using a simple A-Z to a-z mapping, while anything involving non-ASCII
    bytes is decoded and compared via `std.uni.toLower`.

    Params:
        a = left-hand string
        b = right-hand string

    Returns:
        -1 if `a` sorts before `b`, 1 if it sorts after `b` and 0 if both
        are equal when ignoring case. If one string is a prefix of the other,
        the shorter one sorts first.

    Throws:
        Whatever `std.utf.decode` throws when it encounters malformed UTF-8.
*/
int icmp2(string a, string b)
@safe pure {
    size_t i = 0, j = 0;

    // fast skip equal prefix
    size_t min_len = min(a.length, b.length);
    while( i < min_len && a[i] == b[i] ) i++;

    // The byte-wise scan may have stopped in the middle of a multi-byte UTF-8
    // sequence (lead byte equal, a continuation byte differs). In that case
    // rewind to the lead byte, so that decode() below starts on a code point
    // boundary. If the scan stopped on a boundary, nothing must be undone -
    // stepping back unconditionally would land on a continuation byte of the
    // preceding, fully matched character and make decode() throw.
    while( i > 0 && i < a.length && (a[i] & 0xC0) == 0x80 ) i--;
    j = i;

    // compare the differing character and the rest of the string
    while(i < a.length && j < b.length){
        uint ac = cast(uint)a[i];
        uint bc = cast(uint)b[j];
        if( !((ac | bc) & 0x80) ){
            // both are ASCII: cheap manual case folding
            i++;
            j++;
            if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
            if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
            if( ac < bc ) return -1;
            else if( ac > bc ) return 1;
        } else {
            // at least one side is non-ASCII: decode full code points
            // (decode advances i and j past the decoded sequences)
            dchar acp = decode(a, i);
            dchar bcp = decode(b, j);
            if( acp != bcp ){
                acp = std.uni.toLower(acp);
                bcp = std.uni.toLower(bcp);
                if( acp < bcp ) return -1;
                else if( acp > bcp ) return 1;
            }
        }
    }

    // one string is a (case-insensitive) prefix of the other
    if( i < a.length ) return 1;
    else if( j < b.length ) return -1;

    assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
    return 0;
}