vibe-core/source/vibe/internal/string.d

/**
	Utility functions for string processing

	Copyright: © 2012-2014 Sönke Ludwig
	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
	Authors: Sönke Ludwig
*/
module vibe.internal.string;

public import std.string;

import vibe.internal.array;
import vibe.internal.allocator;

import std.algorithm;
import std.array;
import std.ascii;
import std.format;
import std.uni;
import std.utf;
import core.exception;


/**
	Turns a byte buffer that may contain malformed UTF-8 into a valid UTF-8
	string that stays as close to the input as possible.

	Params:
		str = immutable bytes to be interpreted as UTF-8 text
	Returns:
		The result of running `std.encoding.sanitize` over the bytes.
*/
string sanitizeUTF8(immutable(ubyte)[] str)
@safe pure {
	import std.encoding : sanitize;
	// immutable bytes can be viewed as immutable chars without copying
	string text = cast(string)str;
	// only the call into std.encoding needs to be vouched for
	return (() @trusted => sanitize(text))();
}
/// ditto
string sanitizeUTF8(in ubyte[] str)
@trusted pure {
	import std.encoding : sanitize;
	// View the caller's (possibly mutable) bytes as a string for the duration
	// of the call; this view must never escape as-is.
	immutable(char)[] view = cast(immutable(char)[])str;
	string cleaned = sanitize(view);
	// If the very same memory comes back, the result would still alias the
	// caller's buffer, so hand out a private copy instead.
	return cleaned.ptr is view.ptr ? view.idup : cleaned;
}

/**
	Removes a leading UTF-8 byte order mark (the bytes EF BB BF) from a string.

	Params:
		str = text that may start with a BOM, e.g. the contents of a file
	Returns:
		`str` without its first three bytes if they form the BOM, otherwise
		`str` unchanged.
*/
string stripUTF8Bom(string str)
@safe pure nothrow {
	if (str.length < 3) return str;
	immutable hasBom = str[0] == 0xEF && str[1] == 0xBB && str[2] == 0xBF;
	return hasBom ? str[3 .. $] : str;
}


/**
	Tests whether every code point of 'str' also occurs in 'chars'.

	Params:
		str = string whose code points are checked
		chars = set of permitted characters
	Returns:
		`true` if no code point of `str` is missing from `chars`
		(which includes the case of an empty `str`), `false` otherwise.
*/
bool allOf(string str, string chars)
@safe pure {
	foreach (dchar codepoint; str) {
		immutable permitted = canFind(chars, codepoint);
		if (!permitted) return false;
	}
	return true;
}

/**
	Variant of `std.string.indexOf` for a single character that uses a plain
	loop when evaluated at compile time and forwards to `std.string.indexOf`
	at run time.

	Params:
		s = string to search in
		c = character to look for
		cs = whether the comparison is case sensitive
	Returns:
		The index at which the first matching character starts, or -1 if
		there is none.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, dchar c, CaseSensitive cs = CaseSensitive.yes)
@safe pure {
	if (!__ctfe) return std.string.indexOf(s, c, cs);

	// For case-insensitive search both sides are lower-cased before comparing.
	immutable bool fold = cs != CaseSensitive.yes;
	immutable dchar wanted = fold ? std.uni.toLower(c) : c;
	foreach (pos, dchar current; s) {
		immutable dchar candidate = fold ? std.uni.toLower(current) : current;
		if (candidate == wanted) return pos;
	}
	return -1;
}
/**
	Variant of `std.string.indexOf` for a substring that uses a plain loop
	when evaluated at compile time and forwards to `std.string.indexOf` at
	run time.

	Params:
		s = string to search in
		needle = substring to look for
	Returns:
		The index of the first occurrence of `needle` in `s`, or -1 if it
		does not occur.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, in Char[] needle)
{
	if (__ctfe) {
		if (s.length < needle.length) return -1;
		// The last possible start offset is s.length - needle.length, so the
		// exclusive upper bound has to be one past it. Without the "+ 1" a
		// match at the very end of `s` (or `s == needle`) was never found.
		foreach (i; 0 .. s.length - needle.length + 1)
			if (s[i .. i+needle.length] == needle)
				return i;
		return -1;
	} else return std.string.indexOf(s, needle);
}

/**
	Checks if any character in 'str' is contained in 'chars'.

	The string is walked code point by code point (like `allOf` does), so
	non-ASCII characters are matched as whole characters instead of as
	individual UTF-8 bytes.

	Params:
		str = string whose characters are checked
		chars = set of characters to look for
	Returns:
		`true` if at least one character of `str` occurs in `chars`,
		`false` otherwise (including for an empty `str`).
*/
bool anyOf(string str, string chars)
@safe pure {
	// Decode to dchar: iterating raw code units would make canFind treat each
	// byte of a multi-byte sequence as a separate (Latin-1) character.
	foreach (dchar ch; str)
		if (chars.canFind(ch))
			return true;
	return false;
}


/// Removes leading ASCII blanks (space and tab only) and returns the remaining slice of `s`.
string stripLeftA(string s)
@safe pure nothrow {
	size_t start = 0;
	// advance past every leading space/tab
	while (start < s.length && (s[start] == ' ' || s[start] == '\t'))
		start++;
	return s[start .. $];
}

/// Removes trailing ASCII blanks (space and tab only) and returns the remaining slice of `s`.
string stripRightA(string s)
@safe pure nothrow {
	size_t end = s.length;
	// move the end marker back over every trailing space/tab
	while (end > 0 && (s[end - 1] == ' ' || s[end - 1] == '\t'))
		end--;
	return s[0 .. end];
}

/// Removes ASCII blanks (space and tab only) from both ends of `s`.
string stripA(string s)
@safe pure nothrow {
	// trim the tail first, then the head
	auto withoutTrailing = stripRightA(s);
	return stripLeftA(withoutTrailing);
}

/**
	Finds the first occurrence of any of the characters in `chars`.

	The string is walked code point by code point, so non-ASCII characters
	are matched as whole characters instead of as individual UTF-8 bytes.

	Params:
		str = string to search in
		chars = set of characters to look for
	Returns:
		The index (in code units) at which the first matching character
		starts, or -1 if no character of `str` occurs in `chars`.
*/
ptrdiff_t indexOfAny(string str, string chars)
@safe pure {
	// Decode to dchar: comparing raw code units would make canFind treat each
	// byte of a multi-byte sequence as a separate (Latin-1) character. The
	// index delivered by foreach is still the code unit offset.
	foreach (i, dchar ch; str)
		if (chars.canFind(ch))
			return i;
	return -1;
}
alias countUntilAny = indexOfAny;

/**
	Locates the bracket that closes the one at the start of the string
	(supported openers are '[', '$(LPAREN)', '<' and '{').

	Only the bracket kind found at `str[0]` is tracked; other bracket kinds
	inside the string are treated as ordinary characters.

	Params:
		str = input string, expected to begin with an opening bracket
		nested = if `true`, further openers of the same kind must be closed
			before the match is reported; if `false`, the first closer wins
	Returns:
		The index of the closing bracket or -1 for unbalanced strings
		and strings that don't start with a bracket.
*/
ptrdiff_t matchBracket(string str, bool nested = true)
@safe pure nothrow {
	// an opener plus a closer needs at least two characters
	if (str.length < 2) return -1;

	immutable char opening = str[0];
	char closing;
	switch (opening) {
		default: return -1;
		case '[': closing = ']'; break;
		case '(': closing = ')'; break;
		case '<': closing = '>'; break;
		case '{': closing = '}'; break;
	}

	// number of brackets that are currently open, the leading one included
	size_t depth = 1;
	for (size_t pos = 1; pos < str.length; pos++) {
		immutable char current = str[pos];
		if (nested && current == opening) depth++;
		else if (current == closing) depth--;
		if (depth == 0) return pos;
	}
	return -1;
}

@safe unittest
{
	// an input string together with the index matchBracket should report for it
	static struct Case { string input; ptrdiff_t expected; }
	enum cases = [
		// one pair of each tested bracket kind
		Case("[foo]", 4), Case("<bar>", 4), Case("{baz}", 4),
		// too short, unbalanced, not starting with a bracket
		Case("[", -1), Case("[foo", -1), Case("ab[f]", -1),
		// same-kind nesting is tracked, a different kind is ignored
		Case("[foo[bar]]", 9), Case("[foo{bar]]", 8),
	];
	foreach (c; cases)
		assert(matchBracket(c.input) == c.expected);

	// with nesting disabled the first closing bracket is reported
	assert(matchBracket("[foo[bar]]", false) == 8);

	// must also be usable at compile time
	static assert(matchBracket("[foo]") == 4);
}

/**
	Counterpart of std.string.format that collects its output in an
	`AllocAppender` constructed with the given allocator.

	Params:
		alloc = allocator handed to the appender
		fmt = format string as understood by `formattedWrite`
		args = values to format
	Returns:
		The appender's data after formatting.
*/
string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)
{
	auto sink = AllocAppender!string(alloc);
	// pass the appender by pointer so that formattedWrite fills `sink` itself
	formattedWrite(&sink, fmt, args);
	auto formatted = sink.data;
	return formatted;
}

/**
	Special version of icmp() with optimization for ASCII characters.

	The common prefix of both strings is skipped byte-wise first. The rest is
	compared character by character: ASCII letters are lower-cased inline,
	everything else is decoded and lower-cased using `std.uni.toLower`.

	Params:
		a = first string
		b = second string
	Returns:
		A negative value if `a` sorts before `b`, a positive value if it sorts
		after `b` and 0 if both are equal when ignoring case.
	Throws:
		Whatever `std.utf.decode` throws for malformed UTF-8 input.
*/
int icmp2(string a, string b)
@safe pure {
	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// The byte-wise scan may have stopped in the middle of a multi-byte UTF-8
	// sequence. That is the case exactly when the byte at the stop position
	// is a continuation byte (10xxxxxx) in either string. Walk back to the
	// lead byte then; the bytes before the stop position are identical in
	// both strings, so the same offset is valid for both. Looking at the
	// previous byte instead (and stepping back only once) would land on a
	// continuation byte after any complete multi-byte character and make
	// decode() fail.
	while( i > 0 && ((i < a.length && (a[i] & 0xC0) == 0x80) || (i < b.length && (b[i] & 0xC0) == 0x80)) ) i--;
	j = i;

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: fold case without decoding
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// at least one side is non-ASCII: decode full code points
			// (decode advances i and j past the decoded sequences)
			dchar acp = decode(a, i);
			dchar bcp = decode(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a (case-insensitive) prefix of the other: the longer one sorts last
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`/**`
			`Utility functions for string processing`

Correct copyright holder. rejectedsoftware e.K. doesn't exist anymore since mid-2019. 2020-01-27 18:20:52 +00:00			`Copyright: © 2012-2014 Sönke Ludwig`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.`
			`Authors: Sönke Ludwig`
			`*/`
			`module vibe.internal.string;`

			`public import std.string;`

			`import vibe.internal.array;`
Use std.experimental.allocator and remove the custom allocator module. 2016-11-08 14:32:25 +00:00			`import vibe.internal.allocator;`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00
			`import std.algorithm;`
			`import std.array;`
			`import std.ascii;`
			`import std.format;`
			`import std.uni;`
			`import std.utf;`
			`import core.exception;`


			`/**`
			`Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to`
			`the original as possible.`
			`*/`
Improve performance of sanitizeUTF8/readFileUTF8. Uses std.encoding.sanitize, which returns the original string, if all code points are properly encoded. Note that the performance could still be improved considerably by iterating over multiple bytes at once, fast skipping over characters that don't have the most significant bit set. 2019-10-22 09:33:57 +00:00			`string sanitizeUTF8(immutable(ubyte)[] str)`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`@safe pure {`
Improve performance of sanitizeUTF8/readFileUTF8. Uses std.encoding.sanitize, which returns the original string, if all code points are properly encoded. Note that the performance could still be improved considerably by iterating over multiple bytes at once, fast skipping over characters that don't have the most significant bit set. 2019-10-22 09:33:57 +00:00			`import std.encoding : sanitize;`
			`auto ustr = cast(immutable(char)[])str;`
			`return () @trusted { return sanitize(ustr); } ();`
			`}`
			`/// ditto`
			`string sanitizeUTF8(in ubyte[] str)`
			`@trusted pure {`
			`import std.encoding : sanitize;`
			`auto ustr = cast(immutable(char)[])str;`
			`auto ret = sanitize(ustr);`
			`if (ret.ptr is ustr.ptr) return ustr.idup;`
			`else return ret;`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`}`

			`/**`
			`Strips the byte order mark of an UTF8 encoded string.`
			`This is useful when the string is coming from a file.`
			`*/`
			`string stripUTF8Bom(string str)`
			`@safe pure nothrow {`
			`if (str.length >= 3 && str[0 .. 3] == [0xEF, 0xBB, 0xBF])`
			`return str[3 ..$];`
			`return str;`
			`}`


			`/**`
			`Checks if all characters in 'str' are contained in 'chars'.`
			`*/`
			`bool allOf(string str, string chars)`
			`@safe pure {`
			`foreach (dchar ch; str)`
			`if (!chars.canFind(ch))`
			`return false;`
			`return true;`
			`}`

			`ptrdiff_t indexOfCT(Char)(in Char[] s, dchar c, CaseSensitive cs = CaseSensitive.yes)`
			`@safe pure {`
			`if (__ctfe) {`
			`if (cs == CaseSensitive.yes) {`
			`foreach (i, dchar ch; s)`
			`if (ch == c)`
			`return i;`
			`} else {`
			`c = std.uni.toLower(c);`
			`foreach (i, dchar ch; s)`
			`if (std.uni.toLower(ch) == c)`
			`return i;`
			`}`
			`return -1;`
			`} else return std.string.indexOf(s, c, cs);`
			`}`
			`ptrdiff_t indexOfCT(Char)(in Char[] s, in Char[] needle)`
			`{`
			`if (__ctfe) {`
			`if (s.length < needle.length) return -1;`
			`foreach (i; 0 .. s.length - needle.length)`
			`if (s[i .. i+needle.length] == needle)`
			`return i;`
			`return -1;`
			`} else return std.string.indexOf(s, needle);`
			`}`

			`/**`
			`Checks if any character in 'str' is contained in 'chars'.`
			`*/`
			`bool anyOf(string str, string chars)`
			`@safe pure {`
			`foreach (ch; str)`
			`if (chars.canFind(ch))`
			`return true;`
			`return false;`
			`}`


			`/// ASCII whitespace trimming (space and tab)`
			`string stripLeftA(string s)`
			`@safe pure nothrow {`
			`while (s.length > 0 && (s[0] == ' ' \|\| s[0] == '\t'))`
			`s = s[1 .. $];`
			`return s;`
			`}`

			`/// ASCII whitespace trimming (space and tab)`
			`string stripRightA(string s)`
			`@safe pure nothrow {`
			`while (s.length > 0 && (s[$-1] == ' ' \|\| s[$-1] == '\t'))`
			`s = s[0 .. $-1];`
			`return s;`
			`}`

			`/// ASCII whitespace trimming (space and tab)`
			`string stripA(string s)`
			`@safe pure nothrow {`
			`return stripLeftA(stripRightA(s));`
			`}`

			/// Finds the first occurence of any of the characters in `chars`
Replace `sizediff_t` with `ptrdiff_t` 2019-08-31 03:30:50 +00:00			`ptrdiff_t indexOfAny(string str, string chars)`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`@safe pure {`
			`foreach (i, char ch; str)`
			`if (chars.canFind(ch))`
			`return i;`
			`return -1;`
			`}`
			`alias countUntilAny = indexOfAny;`

			`/**`
			`Finds the closing bracket (works with any of '[', '$(LPAREN)', '<', '{').`

			`Params:`
			`str = input string`
			`nested = whether to skip nested brackets`
			`Returns:`
			`The index of the closing bracket or -1 for unbalanced strings`
			`and strings that don't start with a bracket.`
			`*/`
Replace `sizediff_t` with `ptrdiff_t` 2019-08-31 03:30:50 +00:00			`ptrdiff_t matchBracket(string str, bool nested = true)`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`@safe pure nothrow {`
			`if (str.length < 2) return -1;`

			`char open = str[0], close = void;`
			`switch (str[0]) {`
			`case '[': close = ']'; break;`
			`case '(': close = ')'; break;`
			`case '<': close = '>'; break;`
			`case '{': close = '}'; break;`
			`default: return -1;`
			`}`

			`size_t level = 1;`
			`foreach (i, char c; str[1 .. $]) {`
			`if (nested && c == open) ++level;`
			`else if (c == close) --level;`
			`if (level == 0) return i + 1;`
			`}`
			`return -1;`
			`}`

			`@safe unittest`
			`{`
Replace `sizediff_t` with `ptrdiff_t` 2019-08-31 03:30:50 +00:00			`static struct Test { string str; ptrdiff_t res; }`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`enum tests = [`
			`Test("[foo]", 4), Test("<bar>", 4), Test("{baz}", 4),`
			`Test("[", -1), Test("[foo", -1), Test("ab[f]", -1),`
			`Test("[foo[bar]]", 9), Test("[foo{bar]]", 8),`
			`];`
			`foreach (test; tests)`
			`assert(matchBracket(test.str) == test.res);`
			`assert(matchBracket("[foo[bar]]", false) == 8);`
			`static assert(matchBracket("[foo]") == 4);`
			`}`

			`/// Same as std.string.format, just using an allocator.`
Use std.experimental.allocator and remove the custom allocator module. 2016-11-08 14:32:25 +00:00			`string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)`
Initial commit. The library is able to support simple TCP servers in the current state. The API is still mostly compatible with mainline vibe.d, but the driver systen has been replaced by the eventcore library and sockets/files/timers/... are now structs with automatic reference counting instead of GC collected classes. The stream interfaces have been removed for now. 2016-03-01 19:30:42 +00:00			`{`
			`auto app = AllocAppender!string(alloc);`
			`formattedWrite(&app, fmt, args);`
			`return app.data;`
			`}`

			`/// Special version of icmp() with optimization for ASCII characters`
			`int icmp2(string a, string b)`
			`@safe pure {`
			`size_t i = 0, j = 0;`

			`// fast skip equal prefix`
			`size_t min_len = min(a.length, b.length);`
			`while( i < min_len && a[i] == b[i] ) i++;`
			`if( i > 0 && (a[i-1] & 0x80) ) i--; // don't stop half-way in a UTF-8 sequence`
			`j = i;`

			`// compare the differing character and the rest of the string`
			`while(i < a.length && j < b.length){`
			`uint ac = cast(uint)a[i];`
			`uint bc = cast(uint)b[j];`
			`if( !((ac \| bc) & 0x80) ){`
			`i++;`
			`j++;`
			`if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';`
			`if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';`
			`if( ac < bc ) return -1;`
			`else if( ac > bc ) return 1;`
			`} else {`
			`dchar acp = decode(a, i);`
			`dchar bcp = decode(b, j);`
			`if( acp != bcp ){`
			`acp = std.uni.toLower(acp);`
			`bcp = std.uni.toLower(bcp);`
			`if( acp < bcp ) return -1;`
			`else if( acp > bcp ) return 1;`
			`}`
			`}`
			`}`

			`if( i < a.length ) return 1;`
			`else if( j < b.length ) return -1;`

			`assert(i == a.length \|\| j == b.length, "Strings equal but we didn't fully compare them!?");`
			`return 0;`
			`}`