2016-03-01 19:30:42 +00:00
|
|
|
/**
|
|
|
|
Utility functions for string processing
|
|
|
|
|
2020-01-27 18:20:52 +00:00
|
|
|
Copyright: © 2012-2014 Sönke Ludwig
|
2016-03-01 19:30:42 +00:00
|
|
|
License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
|
|
|
|
Authors: Sönke Ludwig
|
|
|
|
*/
|
|
|
|
module vibe.internal.string;
|
|
|
|
|
|
|
|
public import std.string;
|
|
|
|
|
|
|
|
import vibe.internal.array;
|
2016-11-08 14:32:25 +00:00
|
|
|
import vibe.internal.allocator;
|
2016-03-01 19:30:42 +00:00
|
|
|
|
|
|
|
import std.algorithm;
|
|
|
|
import std.array;
|
|
|
|
import std.ascii;
|
|
|
|
import std.format;
|
|
|
|
import std.uni;
|
|
|
|
import std.utf;
|
|
|
|
import core.exception;
|
|
|
|
|
|
|
|
|
|
|
|
/**
	Produces a valid UTF-8 string from input that may contain invalid UTF-8
	sequences, staying as close to the original contents as possible.

	The actual repair work is delegated to `std.encoding.sanitize`.

	Params:
		str = raw bytes to be interpreted as UTF-8

	Returns:
		The result of `std.encoding.sanitize` applied to the bytes of `str`.
*/
string sanitizeUTF8(immutable(ubyte)[] str)
@safe pure {
	import std.encoding : sanitize;

	// reinterpret the immutable bytes as immutable characters (no copy)
	immutable(char)[] text = cast(immutable(char)[])str;

	// sanitize() is not annotated @safe, so the call is wrapped in a trusted lambda
	return (() @trusted => sanitize(text))();
}
|
|
|
|
/**
	Produces a valid UTF-8 string from a byte buffer that may contain invalid
	UTF-8 sequences.

	Unlike the overload taking immutable bytes, the input here is not known to
	be immutable, so the returned string never aliases the caller's buffer.

	Params:
		str = raw bytes to be interpreted as UTF-8

	Returns:
		A freshly allocated copy of the input if `std.encoding.sanitize`
		handed back the very same memory, otherwise the string returned by
		`std.encoding.sanitize`.
*/
string sanitizeUTF8(in ubyte[] str)
@trusted pure {
	import std.encoding : sanitize;

	auto view = cast(immutable(char)[])str;
	auto cleaned = sanitize(view);

	// If sanitize() returned the input slice itself, duplicate it so that the
	// result does not refer to memory that is not actually immutable.
	return cleaned.ptr is view.ptr ? view.idup : cleaned;
}
|
|
|
|
|
|
|
|
/**
	Removes a leading UTF-8 byte order mark (bytes EF BB BF) from a string.

	This is useful when the string is coming from a file.

	Params:
		str = the string to inspect

	Returns:
		`str` without its first three bytes if they form the byte order mark,
		otherwise `str` unchanged.
*/
string stripUTF8Bom(string str)
@safe pure nothrow {
	// too short to contain the three byte mark
	if (str.length < 3) return str;

	immutable has_bom = str[0] == 0xEF && str[1] == 0xBB && str[2] == 0xBF;
	return has_bom ? str[3 .. $] : str;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
	Tests whether every character of `str` also occurs in `chars`.

	`str` is iterated by code point.

	Params:
		str = the string whose characters are checked
		chars = the set of permitted characters

	Returns:
		`false` as soon as a character of `str` is not found in `chars`,
		`true` otherwise (including for an empty `str`).
*/
bool allOf(string str, string chars)
@safe pure {
	foreach (dchar candidate; str) {
		immutable permitted = chars.canFind(candidate);
		if (!permitted) return false;
	}
	return true;
}
|
|
|
|
|
|
|
|
/**
	Variant of `std.string.indexOf` for a single character that also works
	during compile time function evaluation.

	At run time the call is forwarded to `std.string.indexOf`. During CTFE a
	simple loop over the decoded characters of `s` is used instead.

	Params:
		s = the string to search in
		c = the character to search for
		cs = whether the comparison is case sensitive

	Returns:
		The index at which the first matching character starts, or -1 if
		there is no match.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, dchar c, CaseSensitive cs = CaseSensitive.yes)
@safe pure {
	if (!__ctfe) return std.string.indexOf(s, c, cs);

	// For case insensitive matching both sides are lower-cased before comparing
	immutable fold_case = cs != CaseSensitive.yes;
	immutable dchar wanted = fold_case ? std.uni.toLower(c) : c;

	foreach (pos, dchar cur; s) {
		immutable dchar normalized = fold_case ? std.uni.toLower(cur) : cur;
		if (normalized == wanted) return pos;
	}
	return -1;
}
|
|
|
|
/**
	Variant of `std.string.indexOf` for a sub string that also works during
	compile time function evaluation.

	At run time the call is forwarded to `std.string.indexOf`. During CTFE a
	plain code unit wise search is performed.

	Params:
		s = the string to search in
		needle = the sub string to search for

	Returns:
		The index of the first occurrence of `needle` within `s`, or -1 if
		`needle` does not occur in `s`.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, in Char[] needle)
{
	if (__ctfe) {
		if (s.length < needle.length) return -1;
		// The last possible start position is s.length - needle.length, so
		// the exclusive upper bound of the range has to be one past that.
		foreach (i; 0 .. s.length - needle.length + 1)
			if (s[i .. i+needle.length] == needle)
				return i;
		return -1;
	} else return std.string.indexOf(s, needle);
}
|
|
|
|
|
|
|
|
/**
	Checks if any character in 'str' is contained in 'chars'.

	`str` is iterated by code point, so that characters outside of the ASCII
	range are matched as a whole instead of byte by byte.

	Params:
		str = the string whose characters are checked
		chars = the set of characters to look for

	Returns:
		`true` as soon as a character of `str` is found in `chars`,
		`false` otherwise (including for an empty `str`).
*/
bool anyOf(string str, string chars)
@safe pure {
	// Decode to dchar: looking up individual UTF-8 code units would treat
	// each byte of a multi-byte sequence as a separate code point.
	foreach (dchar ch; str)
		if (chars.canFind(ch))
			return true;
	return false;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
	Removes leading ASCII whitespace (space and tab characters only).

	Params:
		s = the string to trim

	Returns:
		The slice of `s` that starts at the first character that is neither
		a space nor a tab.
*/
string stripLeftA(string s)
@safe pure nothrow {
	size_t start = 0;
	while (start < s.length) {
		immutable ch = s[start];
		if (ch != ' ' && ch != '\t') break;
		start++;
	}
	return s[start .. $];
}
|
|
|
|
|
|
|
|
/**
	Removes trailing ASCII whitespace (space and tab characters only).

	Params:
		s = the string to trim

	Returns:
		The slice of `s` that ends after the last character that is neither
		a space nor a tab.
*/
string stripRightA(string s)
@safe pure nothrow {
	size_t end = s.length;
	while (end > 0) {
		immutable ch = s[end - 1];
		if (ch != ' ' && ch != '\t') break;
		end--;
	}
	return s[0 .. end];
}
|
|
|
|
|
|
|
|
/**
	Removes ASCII whitespace (space and tab characters only) from both ends
	of a string.

	Params:
		s = the string to trim

	Returns:
		`s` after trimming its right side with `stripRightA` and then its
		left side with `stripLeftA`.
*/
string stripA(string s)
@safe pure nothrow {
	auto right_trimmed = s.stripRightA();
	return right_trimmed.stripLeftA();
}
|
|
|
|
|
|
|
|
/**
	Finds the first occurrence of any of the characters in `chars`.

	`str` is iterated by code point, so that characters outside of the ASCII
	range are matched as a whole instead of byte by byte.

	Params:
		str = the string to search in
		chars = the set of characters to search for

	Returns:
		The index (in code units) at which the first matching character of
		`str` starts, or -1 if none of the characters occurs in `str`.
*/
ptrdiff_t indexOfAny(string str, string chars)
@safe pure {
	// Decode to dchar: looking up individual UTF-8 code units would treat
	// each byte of a multi-byte sequence as a separate code point. The
	// index still denotes the code unit offset of the decoded character.
	foreach (i, dchar ch; str)
		if (chars.canFind(ch))
			return i;
	return -1;
}
alias countUntilAny = indexOfAny;
|
|
|
|
|
|
|
|
/**
	Locates the bracket that closes the bracket at the start of a string.

	Supported opening brackets are '[', '$(LPAREN)', '<' and '{'.

	Params:
		str = input string, expected to start with an opening bracket
		nested = if `true`, further occurrences of the same opening bracket
			increase the nesting depth and need to be closed first; if
			`false`, the first matching closing bracket wins

	Returns:
		The index of the closing bracket, or -1 if the string is shorter
		than two characters, does not start with a supported bracket, or
		contains no matching closing bracket.
*/
ptrdiff_t matchBracket(string str, bool nested = true)
@safe pure nothrow {
	// an opening bracket plus a closing one need at least two characters
	if (str.length < 2) return -1;

	immutable char opener = str[0];
	char closer;
	switch (opener) {
		default: return -1;
		case '[': closer = ']'; break;
		case '(': closer = ')'; break;
		case '<': closer = '>'; break;
		case '{': closer = '}'; break;
	}

	// number of brackets that are currently open, counting the initial one
	size_t depth = 1;
	foreach (pos; 1 .. str.length) {
		immutable char cur = str[pos];
		if (nested && cur == opener) {
			depth++;
		} else if (cur == closer) {
			depth--;
			if (depth == 0) return pos;
		}
	}
	return -1;
}
|
|
|
|
|
|
|
|
// Exercises matchBracket at run time and at compile time.
@safe unittest
{
	static struct Case { string input; ptrdiff_t expected; }
	enum cases = [
		// simple balanced brackets
		Case("[foo]", 4), Case("<bar>", 4), Case("{baz}", 4),
		// too short, unbalanced, or not starting with a bracket
		Case("[", -1), Case("[foo", -1), Case("ab[f]", -1),
		// nesting of the same kind is tracked, other bracket kinds are ignored
		Case("[foo[bar]]", 9), Case("[foo{bar]]", 8),
	];

	foreach (c; cases) {
		assert(matchBracket(c.input) == c.expected);
	}

	// without nesting, the first closing bracket terminates the match
	assert(matchBracket("[foo[bar]]", false) == 8);

	// must be usable during CTFE
	static assert(matchBracket("[foo]") == 4);
}
|
|
|
|
|
|
|
|
/**
	Formats a string like `std.string.format`, but collects the output in an
	`AllocAppender` that is constructed with the given allocator.

	Params:
		alloc = allocator handed to the `AllocAppender` that holds the result
		fmt = format string, as accepted by `formattedWrite`
		args = values to be formatted

	Returns:
		The contents of the appender after formatting.
*/
string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)
{
	auto sink = AllocAppender!string(alloc);
	// the appender is passed by pointer so that formattedWrite fills `sink` itself
	formattedWrite(&sink, fmt, args);
	return sink.data;
}
|
|
|
|
|
|
|
|
/**
	Special version of icmp() with optimization for ASCII characters.

	The strings are compared case insensitively. A common prefix of identical
	bytes is skipped first. After that, pairs of ASCII characters are compared
	directly, and all other characters are decoded with `std.utf.decode` and
	lower-cased with `std.uni.toLower` before being compared.

	Params:
		a = left hand side of the comparison
		b = right hand side of the comparison

	Returns:
		-1 if `a` is ordered before `b`, 1 if `a` is ordered after `b`, and
		0 if both strings are equal when ignoring case.
*/
int icmp2(string a, string b)
@safe pure {
	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// Don't stop half-way in a UTF-8 sequence: if the first differing byte is
	// a continuation byte (bit pattern 10xxxxxx), rewind to the lead byte of
	// that sequence. A position that follows a fully matched multi-byte
	// character is already a valid boundary and must not be rewound, because
	// decode() would otherwise be started on a continuation byte.
	if( i < min_len ){
		while( i > 0 && ((a[i] & 0xC0) == 0x80 || (b[i] & 0xC0) == 0x80) ) i--;
	}
	j = i;

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: fold upper case letters manually
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// at least one side is non-ASCII: compare full code points
			// (decode advances i and j past the decoded sequences)
			dchar acp = decode(a, i);
			dchar bcp = decode(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a (case insensitive) prefix of the other
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}
|