src: replace naive search in Buffer::IndexOf
Adds the string search implementation from V8, which uses a naive search if the pattern length is < 8, or up to a specific badness, and then uses Boyer-Moore-Horspool. The added benchmark shows the expected improvements. Also adds an option to use ucs2 encoding with Buffer::IndexOf. Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Trevor Norris <trev.norris@gmail.com> PR-URL: https://github.com/nodejs/node/pull/2539
This commit is contained in:
parent
aeee956ac7
commit
a18dd7b788
38
benchmark/buffers/buffer-indexof.js
Normal file
38
benchmark/buffers/buffer-indexof.js
Normal file
@ -0,0 +1,38 @@
|
||||
var common = require('../common.js');
|
||||
var fs = require('fs');
|
||||
|
||||
// Benchmark matrix: every combination of needle, encoding and needle type
// below is handed to main() as one configuration.
var bench = common.createBenchmark(main, {
  // Needles searched for in the alice.html fixture, from a single character
  // up to a full sentence.
  search: [
    '@',
    'SQ',
    '10x',
    '--l',
    'Alice',
    'Gryphon',
    'Panther',
    'Ou est ma chatte?',
    'found it very',
    'among mad people',
    'neighbouring pool',
    'Soo--oop',
    'aaaaaaaaaaaaaaaaa',
    'venture to go near the house till she had brought herself down to',
    '</i> to the Caterpillar'
  ],
  // The literal string 'undefined' is mapped to "no encoding argument"
  // inside main().
  encoding: [
    'undefined',
    'utf8',
    'ucs2',
    'binary'
  ],
  // Whether the needle is passed to indexOf() as a Buffer or as a string.
  type: [
    'buffer',
    'string'
  ],
  // Multiplied by 100000 in main() to get the number of indexOf() calls.
  iter: [
    1
  ]
});
|
||||
|
||||
/**
 * Runs one benchmark configuration: repeatedly searches the alice.html
 * fixture for `conf.search` with Buffer#indexOf and reports the call count.
 *
 * @param {Object} conf - One combination from the createBenchmark() matrix
 *     (`search`, `encoding`, `type`, `iter`).
 */
function main(conf) {
  var iterations = (conf.iter) * 100000;
  var haystack = fs.readFileSync(__dirname + '/../fixtures/alice.html');
  var needle = conf.search;

  // The option matrix spells "no encoding" as the string 'undefined'.
  var encoding = conf.encoding === 'undefined' ? undefined : conf.encoding;

  // For ucs2 runs the fixture is re-encoded so haystack and needle share the
  // same byte representation.
  if (encoding === 'ucs2')
    haystack = new Buffer(haystack.toString(), encoding);

  // Buffer needles are built in the encoding under test.
  if (conf.type === 'buffer')
    needle = new Buffer(new Buffer(needle).toString(), encoding);

  bench.start();
  for (var remaining = iterations; remaining > 0; remaining--)
    haystack.indexOf(needle, 0, encoding);
  bench.end(iterations);
}
|
3865
benchmark/fixtures/alice.html
Normal file
3865
benchmark/fixtures/alice.html
Normal file
File diff suppressed because it is too large
Load Diff
@ -410,20 +410,53 @@ Buffer.prototype.compare = function compare(b) {
|
||||
return binding.compare(this, b);
|
||||
};
|
||||
|
||||
function slowIndexOf(buffer, val, byteOffset, encoding) {
|
||||
var loweredCase = false;
|
||||
for (;;) {
|
||||
switch (encoding) {
|
||||
case 'utf8':
|
||||
case 'utf-8':
|
||||
case 'ucs2':
|
||||
case 'ucs-2':
|
||||
case 'utf16le':
|
||||
case 'utf-16le':
|
||||
case 'binary':
|
||||
return binding.indexOfString(buffer, val, byteOffset, encoding);
|
||||
|
||||
Buffer.prototype.indexOf = function indexOf(val, byteOffset) {
|
||||
case 'base64':
|
||||
case 'ascii':
|
||||
case 'hex':
|
||||
return binding.indexOfBuffer(
|
||||
buffer, Buffer(val, encoding), byteOffset, encoding);
|
||||
|
||||
default:
|
||||
if (loweredCase) {
|
||||
throw new TypeError('Unknown encoding: ' + encoding);
|
||||
}
|
||||
|
||||
encoding = ('' + encoding).toLowerCase();
|
||||
loweredCase = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
|
||||
if (byteOffset > 0x7fffffff)
|
||||
byteOffset = 0x7fffffff;
|
||||
else if (byteOffset < -0x80000000)
|
||||
byteOffset = -0x80000000;
|
||||
byteOffset >>= 0;
|
||||
|
||||
if (typeof val === 'string')
|
||||
return binding.indexOfString(this, val, byteOffset);
|
||||
if (val instanceof Buffer)
|
||||
return binding.indexOfBuffer(this, val, byteOffset);
|
||||
if (typeof val === 'number')
|
||||
if (typeof val === 'string') {
|
||||
if (encoding === undefined) {
|
||||
return binding.indexOfString(this, val, byteOffset, encoding);
|
||||
}
|
||||
return slowIndexOf(this, val, byteOffset, encoding);
|
||||
} else if (val instanceof Buffer) {
|
||||
return binding.indexOfBuffer(this, val, byteOffset, encoding);
|
||||
} else if (typeof val === 'number') {
|
||||
return binding.indexOfNumber(this, val, byteOffset);
|
||||
}
|
||||
|
||||
throw new TypeError('val must be string, number or Buffer');
|
||||
};
|
||||
|
1
node.gyp
1
node.gyp
@ -169,6 +169,7 @@
|
||||
'src/util.h',
|
||||
'src/util-inl.h',
|
||||
'src/util.cc',
|
||||
'src/string_search.cc',
|
||||
'deps/http_parser/http_parser.h',
|
||||
'deps/v8/include/v8.h',
|
||||
'deps/v8/include/v8-debug.h',
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include "env.h"
|
||||
#include "env-inl.h"
|
||||
#include "string_bytes.h"
|
||||
#include "string_search.h"
|
||||
#include "util.h"
|
||||
#include "util-inl.h"
|
||||
#include "v8-profiler.h"
|
||||
@ -792,88 +793,157 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
|
||||
}
|
||||
|
||||
|
||||
int32_t IndexOf(const char* haystack,
|
||||
size_t h_length,
|
||||
const char* needle,
|
||||
size_t n_length) {
|
||||
CHECK_GE(h_length, n_length);
|
||||
// TODO(trevnorris): Implement Boyer-Moore string search algorithm.
|
||||
for (size_t i = 0; i < h_length - n_length + 1; i++) {
|
||||
if (haystack[i] == needle[0]) {
|
||||
if (memcmp(haystack + i, needle, n_length) == 0)
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
void IndexOfString(const FunctionCallbackInfo<Value>& args) {
|
||||
ASSERT(args[1]->IsString());
|
||||
ASSERT(args[2]->IsNumber());
|
||||
|
||||
enum encoding enc = ParseEncoding(args.GetIsolate(),
|
||||
args[3],
|
||||
UTF8);
|
||||
|
||||
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
|
||||
SPREAD_ARG(args[0], ts_obj);
|
||||
|
||||
node::Utf8Value str(args.GetIsolate(), args[1]);
|
||||
int32_t offset_i32 = args[2]->Int32Value();
|
||||
uint32_t offset;
|
||||
Local<String> needle = args[1].As<String>();
|
||||
const char* haystack = ts_obj_data;
|
||||
const size_t haystack_length = ts_obj_length;
|
||||
const size_t needle_length = needle->Utf8Length();
|
||||
|
||||
if (offset_i32 < 0) {
|
||||
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
|
||||
offset = 0;
|
||||
else
|
||||
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
|
||||
} else {
|
||||
offset = static_cast<uint32_t>(offset_i32);
|
||||
|
||||
if (needle_length == 0 || haystack_length == 0) {
|
||||
return args.GetReturnValue().Set(-1);
|
||||
}
|
||||
|
||||
if (str.length() == 0 ||
|
||||
ts_obj_length == 0 ||
|
||||
(offset != 0 && str.length() + offset <= str.length()) ||
|
||||
str.length() + offset > ts_obj_length)
|
||||
int64_t offset_i64 = args[2]->IntegerValue();
|
||||
size_t offset = 0;
|
||||
|
||||
if (offset_i64 < 0) {
|
||||
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
|
||||
offset = 0;
|
||||
} else {
|
||||
offset = static_cast<size_t>(haystack_length + offset_i64);
|
||||
}
|
||||
} else {
|
||||
offset = static_cast<size_t>(offset_i64);
|
||||
}
|
||||
|
||||
if (haystack_length < offset || needle_length + offset > haystack_length) {
|
||||
return args.GetReturnValue().Set(-1);
|
||||
}
|
||||
|
||||
int32_t r =
|
||||
IndexOf(ts_obj_data + offset, ts_obj_length - offset, *str, str.length());
|
||||
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
|
||||
size_t result = haystack_length;
|
||||
|
||||
if (enc == UCS2) {
|
||||
String::Value needle_value(needle);
|
||||
if (*needle_value == nullptr)
|
||||
return args.GetReturnValue().Set(-1);
|
||||
|
||||
if (haystack_length < 2 || needle_value.length() < 1) {
|
||||
return args.GetReturnValue().Set(-1);
|
||||
}
|
||||
|
||||
result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
|
||||
haystack_length / 2,
|
||||
reinterpret_cast<const uint16_t*>(*needle_value),
|
||||
needle_value.length(),
|
||||
offset / 2);
|
||||
result *= 2;
|
||||
} else if (enc == UTF8) {
|
||||
String::Utf8Value needle_value(needle);
|
||||
if (*needle_value == nullptr)
|
||||
return args.GetReturnValue().Set(-1);
|
||||
|
||||
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
|
||||
haystack_length,
|
||||
reinterpret_cast<const uint8_t*>(*needle_value),
|
||||
needle_length,
|
||||
offset);
|
||||
} else if (enc == BINARY) {
|
||||
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
|
||||
if (needle_data == nullptr) {
|
||||
return args.GetReturnValue().Set(-1);
|
||||
}
|
||||
needle->WriteOneByte(
|
||||
needle_data, 0, needle_length, String::NO_NULL_TERMINATION);
|
||||
|
||||
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
|
||||
haystack_length,
|
||||
needle_data,
|
||||
needle_length,
|
||||
offset);
|
||||
free(needle_data);
|
||||
}
|
||||
|
||||
args.GetReturnValue().Set(
|
||||
result == haystack_length ? -1 : static_cast<int>(result));
|
||||
}
|
||||
|
||||
|
||||
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
|
||||
ASSERT(args[1]->IsObject());
|
||||
ASSERT(args[2]->IsNumber());
|
||||
|
||||
enum encoding enc = ParseEncoding(args.GetIsolate(),
|
||||
args[3],
|
||||
UTF8);
|
||||
|
||||
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
|
||||
SPREAD_ARG(args[0], ts_obj);
|
||||
SPREAD_ARG(args[1], buf);
|
||||
const int32_t offset_i32 = args[2]->Int32Value();
|
||||
uint32_t offset;
|
||||
|
||||
if (buf_length > 0)
|
||||
CHECK_NE(buf_data, nullptr);
|
||||
|
||||
if (offset_i32 < 0) {
|
||||
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
|
||||
offset = 0;
|
||||
else
|
||||
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
|
||||
} else {
|
||||
offset = static_cast<uint32_t>(offset_i32);
|
||||
const char* haystack = ts_obj_data;
|
||||
const size_t haystack_length = ts_obj_length;
|
||||
const char* needle = buf_data;
|
||||
const size_t needle_length = buf_length;
|
||||
|
||||
if (needle_length == 0 || haystack_length == 0) {
|
||||
return args.GetReturnValue().Set(-1);
|
||||
}
|
||||
|
||||
if (buf_length == 0 ||
|
||||
ts_obj_length == 0 ||
|
||||
(offset != 0 && buf_length + offset <= buf_length) ||
|
||||
buf_length + offset > ts_obj_length)
|
||||
int64_t offset_i64 = args[2]->IntegerValue();
|
||||
size_t offset = 0;
|
||||
|
||||
if (offset_i64 < 0) {
|
||||
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
|
||||
offset = 0;
|
||||
else
|
||||
offset = static_cast<size_t>(haystack_length + offset_i64);
|
||||
} else {
|
||||
offset = static_cast<size_t>(offset_i64);
|
||||
}
|
||||
|
||||
if (haystack_length < offset || needle_length + offset > haystack_length) {
|
||||
return args.GetReturnValue().Set(-1);
|
||||
}
|
||||
|
||||
int32_t r =
|
||||
IndexOf(ts_obj_data + offset, ts_obj_length - offset, buf_data, buf_length);
|
||||
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
|
||||
size_t result = haystack_length;
|
||||
|
||||
if (enc == UCS2) {
|
||||
if (haystack_length < 2 || needle_length < 2) {
|
||||
return args.GetReturnValue().Set(-1);
|
||||
}
|
||||
result = SearchString(
|
||||
reinterpret_cast<const uint16_t*>(haystack),
|
||||
haystack_length / 2,
|
||||
reinterpret_cast<const uint16_t*>(needle),
|
||||
needle_length / 2,
|
||||
offset / 2);
|
||||
result *= 2;
|
||||
} else {
|
||||
result = SearchString(
|
||||
reinterpret_cast<const uint8_t*>(haystack),
|
||||
haystack_length,
|
||||
reinterpret_cast<const uint8_t*>(needle),
|
||||
needle_length,
|
||||
offset);
|
||||
}
|
||||
|
||||
args.GetReturnValue().Set(
|
||||
result == haystack_length ? -1 : static_cast<int>(result));
|
||||
}
|
||||
|
||||
|
||||
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
|
||||
ASSERT(args[1]->IsNumber());
|
||||
ASSERT(args[2]->IsNumber());
|
||||
@ -882,16 +952,16 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
|
||||
SPREAD_ARG(args[0], ts_obj);
|
||||
|
||||
uint32_t needle = args[1]->Uint32Value();
|
||||
int32_t offset_i32 = args[2]->Int32Value();
|
||||
uint32_t offset;
|
||||
int64_t offset_i64 = args[2]->IntegerValue();
|
||||
size_t offset;
|
||||
|
||||
if (offset_i32 < 0) {
|
||||
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
|
||||
if (offset_i64 < 0) {
|
||||
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
|
||||
offset = 0;
|
||||
else
|
||||
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
|
||||
offset = static_cast<size_t>(ts_obj_length + offset_i64);
|
||||
} else {
|
||||
offset = static_cast<uint32_t>(offset_i32);
|
||||
offset = static_cast<size_t>(offset_i64);
|
||||
}
|
||||
|
||||
if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
|
||||
@ -899,8 +969,8 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
|
||||
|
||||
void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
|
||||
char* ptr_char = static_cast<char*>(ptr);
|
||||
args.GetReturnValue().Set(
|
||||
ptr ? static_cast<int32_t>(ptr_char - ts_obj_data) : -1);
|
||||
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
|
||||
: -1);
|
||||
}
|
||||
|
||||
|
||||
|
10
src/string_search.cc
Normal file
10
src/string_search.cc
Normal file
@ -0,0 +1,10 @@
|
||||
#include "string_search.h"
|
||||
|
||||
namespace node {
|
||||
namespace stringsearch {
|
||||
|
||||
// Out-of-class definitions of the static tables declared in StringSearchBase
// (string_search.h). Being static, they are shared by every StringSearch
// instance.
// NOTE(review): shared mutable tables presumably mean searches must not run
// concurrently - confirm against callers.
int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize];
|
||||
int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1];
|
||||
int StringSearchBase::kSuffixTable[kBMMaxShift + 1];
|
||||
}  // namespace stringsearch
|
||||
}  // namespace node
|
671
src/string_search.h
Normal file
671
src/string_search.h
Normal file
@ -0,0 +1,671 @@
|
||||
// Copyright 2011 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_STRING_SEARCH_H_
|
||||
#define SRC_STRING_SEARCH_H_
|
||||
|
||||
#include "node.h"
|
||||
#include <string.h>
|
||||
|
||||
namespace node {
|
||||
namespace stringsearch {
|
||||
|
||||
|
||||
// Yields the larger of `a` and `b`; when neither compares less than the
// other, `a` is returned.
template <typename T>
T Max(T a, T b) {
  if (a < b) {
    return b;
  }
  return a;
}
|
||||
|
||||
|
||||
static const uint32_t kMaxOneByteCharCodeU = 0xff;
|
||||
|
||||
|
||||
static inline size_t NonOneByteStart(const uint16_t* chars, size_t length) {
|
||||
const uint16_t* limit = chars + length;
|
||||
const uint16_t* start = chars;
|
||||
while (chars < limit) {
|
||||
if (*chars > kMaxOneByteCharCodeU)
|
||||
return static_cast<size_t>(chars - start);
|
||||
++chars;
|
||||
}
|
||||
return static_cast<size_t>(chars - start);
|
||||
}
|
||||
|
||||
|
||||
static inline bool IsOneByte(const uint16_t* chars, size_t length) {
|
||||
return NonOneByteStart(chars, length) >= length;
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
class Vector {
|
||||
public:
|
||||
Vector(T* data, size_t length) : start_(data), length_(length) {
|
||||
ASSERT(length > 0 && data != nullptr);
|
||||
}
|
||||
|
||||
// Returns the length of the vector.
|
||||
size_t length() const { return length_; }
|
||||
|
||||
T* start() const { return start_; }
|
||||
|
||||
// Access individual vector elements - checks bounds in debug mode.
|
||||
T& operator[](size_t index) const {
|
||||
ASSERT(0 <= index && index < length_);
|
||||
return start_[index];
|
||||
}
|
||||
|
||||
const T& at(size_t index) const { return operator[](index); }
|
||||
|
||||
bool operator==(const Vector<T>& other) const {
|
||||
if (length_ != other.length_)
|
||||
return false;
|
||||
if (start_ == other.start_)
|
||||
return true;
|
||||
for (size_t i = 0; i < length_; ++i) {
|
||||
if (start_[i] != other.start_[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
T* start_;
|
||||
size_t length_;
|
||||
};
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// String Search object.
|
||||
//---------------------------------------------------------------------
|
||||
|
||||
// Class holding constants and methods that apply to all string search variants,
|
||||
// independently of subject and pattern char size.
|
||||
class StringSearchBase {
|
||||
protected:
|
||||
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
|
||||
// limit, we can fix the size of tables. For a needle longer than this limit,
|
||||
// search will not be optimal, since we only build tables for a suffix
|
||||
// of the string, but it is a safe approximation.
|
||||
static const int kBMMaxShift = 250;
|
||||
|
||||
// Reduce alphabet to this size.
|
||||
// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
|
||||
// proportional to the input alphabet. We reduce the alphabet size by
|
||||
// equating input characters modulo a smaller alphabet size. This gives
|
||||
// a potentially less efficient searching, but is a safe approximation.
|
||||
// For needles using only characters in the same Unicode 256-code point page,
|
||||
// there is no search speed degradation.
|
||||
static const int kLatin1AlphabetSize = 256;
|
||||
static const int kUC16AlphabetSize = 256;
|
||||
|
||||
// Bad-char shift table stored in the state. It's length is the alphabet size.
|
||||
// For patterns below this length, the skip length of Boyer-Moore is too short
|
||||
// to compensate for the algorithmic overhead compared to simple brute force.
|
||||
static const int kBMMinPatternLength = 8;
|
||||
|
||||
// Store for the BoyerMoore(Horspool) bad char shift table.
|
||||
static int kBadCharShiftTable[kUC16AlphabetSize];
|
||||
// Store for the BoyerMoore good suffix shift table.
|
||||
static int kGoodSuffixShiftTable[kBMMaxShift + 1];
|
||||
// Table used temporarily while building the BoyerMoore good suffix
|
||||
// shift table.
|
||||
static int kSuffixTable[kBMMaxShift + 1];
|
||||
|
||||
static inline bool IsOneByteString(Vector<const uint8_t> string) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool IsOneByteString(Vector<const uint16_t> string) {
|
||||
return IsOneByte(string.start(), string.length());
|
||||
}
|
||||
};
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
class StringSearch : private StringSearchBase {
|
||||
public:
|
||||
explicit StringSearch(Vector<const PatternChar> pattern)
|
||||
: pattern_(pattern), start_(0) {
|
||||
if (pattern.length() >= kBMMaxShift) {
|
||||
start_ = pattern.length() - kBMMaxShift;
|
||||
}
|
||||
|
||||
if (sizeof(PatternChar) > sizeof(SubjectChar)) {
|
||||
if (!IsOneByteString(pattern_)) {
|
||||
strategy_ = &FailSearch;
|
||||
return;
|
||||
}
|
||||
}
|
||||
size_t pattern_length = pattern_.length();
|
||||
CHECK_GT(pattern_length, 0);
|
||||
if (pattern_length < kBMMinPatternLength) {
|
||||
if (pattern_length == 1) {
|
||||
strategy_ = &SingleCharSearch;
|
||||
return;
|
||||
}
|
||||
strategy_ = &LinearSearch;
|
||||
return;
|
||||
}
|
||||
strategy_ = &InitialSearch;
|
||||
}
|
||||
|
||||
size_t Search(Vector<const SubjectChar> subject, size_t index) {
|
||||
return strategy_(this, subject, index);
|
||||
}
|
||||
|
||||
static inline int AlphabetSize() {
|
||||
if (sizeof(PatternChar) == 1) {
|
||||
// Latin1 needle.
|
||||
return kLatin1AlphabetSize;
|
||||
} else {
|
||||
// UC16 needle.
|
||||
return kUC16AlphabetSize;
|
||||
}
|
||||
|
||||
static_assert(sizeof(PatternChar) == sizeof(uint8_t) ||
|
||||
sizeof(PatternChar) == sizeof(uint16_t),
|
||||
"sizeof(PatternChar) == sizeof(uint16_t) || sizeof(uint8_t)");
|
||||
}
|
||||
|
||||
private:
|
||||
typedef size_t (*SearchFunction)( // NOLINT - it's not a cast!
|
||||
StringSearch<PatternChar, SubjectChar>*,
|
||||
Vector<const SubjectChar>,
|
||||
size_t);
|
||||
|
||||
static size_t FailSearch(StringSearch<PatternChar, SubjectChar>*,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t) {
|
||||
return subject.length();
|
||||
}
|
||||
|
||||
static size_t SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t start_index);
|
||||
|
||||
static size_t LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t start_index);
|
||||
|
||||
static size_t InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t start_index);
|
||||
|
||||
static size_t BoyerMooreHorspoolSearch(
|
||||
StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t start_index);
|
||||
|
||||
static size_t BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t start_index);
|
||||
|
||||
void PopulateBoyerMooreHorspoolTable();
|
||||
|
||||
void PopulateBoyerMooreTable();
|
||||
|
||||
static inline bool exceedsOneByte(uint8_t c) { return false; }
|
||||
|
||||
static inline bool exceedsOneByte(uint16_t c) {
|
||||
return c > kMaxOneByteCharCodeU;
|
||||
}
|
||||
|
||||
static inline int CharOccurrence(int* bad_char_occurrence,
|
||||
SubjectChar char_code) {
|
||||
if (sizeof(SubjectChar) == 1) {
|
||||
return bad_char_occurrence[static_cast<int>(char_code)];
|
||||
}
|
||||
if (sizeof(PatternChar) == 1) {
|
||||
if (exceedsOneByte(char_code)) {
|
||||
return -1;
|
||||
}
|
||||
return bad_char_occurrence[static_cast<unsigned int>(char_code)];
|
||||
}
|
||||
// Both pattern and subject are UC16. Reduce character to equivalence class.
|
||||
int equiv_class = char_code % kUC16AlphabetSize;
|
||||
return bad_char_occurrence[equiv_class];
|
||||
}
|
||||
|
||||
// Store for the BoyerMoore(Horspool) bad char shift table.
|
||||
// Return a table covering the last kBMMaxShift+1 positions of
|
||||
// pattern.
|
||||
int* bad_char_table() { return kBadCharShiftTable; }
|
||||
|
||||
// Store for the BoyerMoore good suffix shift table.
|
||||
int* good_suffix_shift_table() {
|
||||
// Return biased pointer that maps the range [start_..pattern_.length()
|
||||
// to the kGoodSuffixShiftTable array.
|
||||
return kGoodSuffixShiftTable - start_;
|
||||
}
|
||||
|
||||
// Table used temporarily while building the BoyerMoore good suffix
|
||||
// shift table.
|
||||
int* suffix_table() {
|
||||
// Return biased pointer that maps the range [start_..pattern_.length()
|
||||
// to the kSuffixTable array.
|
||||
return kSuffixTable - start_;
|
||||
}
|
||||
|
||||
// The pattern to search for.
|
||||
Vector<const PatternChar> pattern_;
|
||||
// Pointer to implementation of the search.
|
||||
SearchFunction strategy_;
|
||||
// Cache value of Max(0, pattern_length() - kBMMaxShift)
|
||||
size_t start_;
|
||||
};
|
||||
|
||||
|
||||
// Rounds the address held in `value` down to a multiple of `alignment` by
// clearing the low bits selected by (alignment - 1).
// NOTE(review): the mask only works for power-of-two alignments - presumably
// all callers pass sizeof(char type); confirm.
template <typename T, typename U>
inline T AlignDown(T value, U alignment) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(value);
  const auto aligned = address & ~(alignment - 1);
  return reinterpret_cast<T>(aligned);
}
|
||||
|
||||
|
||||
inline uint8_t GetHighestValueByte(uint16_t character) {
|
||||
return Max(static_cast<uint8_t>(character & 0xFF),
|
||||
static_cast<uint8_t>(character >> 8));
|
||||
}
|
||||
|
||||
|
||||
inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
|
||||
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
inline size_t FindFirstCharacter(Vector<const PatternChar> pattern,
|
||||
Vector<const SubjectChar> subject, size_t index) {
|
||||
const PatternChar pattern_first_char = pattern[0];
|
||||
const size_t max_n = (subject.length() - pattern.length() + 1);
|
||||
|
||||
const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
|
||||
const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
|
||||
size_t pos = index;
|
||||
do {
|
||||
const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
|
||||
memchr(subject.start() + pos, search_byte,
|
||||
(max_n - pos) * sizeof(SubjectChar)));
|
||||
if (char_pos == nullptr)
|
||||
return subject.length();
|
||||
char_pos = AlignDown(char_pos, sizeof(SubjectChar));
|
||||
pos = static_cast<size_t>(char_pos - subject.start());
|
||||
if (subject[pos] == search_char)
|
||||
return pos;
|
||||
} while (++pos < max_n);
|
||||
|
||||
return subject.length();
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
|
||||
Vector<const uint8_t> subject,
|
||||
size_t index) {
|
||||
const uint8_t pattern_first_char = pattern[0];
|
||||
const size_t max_n = (subject.length() - pattern.length() + 1);
|
||||
|
||||
const uint8_t* char_pos = reinterpret_cast<const uint8_t*>(
|
||||
memchr(subject.start() + index, pattern_first_char, max_n - index));
|
||||
if (char_pos == nullptr)
|
||||
return subject.length();
|
||||
return static_cast<size_t>(char_pos - subject.start());
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Single Character Pattern Search Strategy
|
||||
//---------------------------------------------------------------------
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
size_t StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
|
||||
StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t index) {
|
||||
CHECK_EQ(1, search->pattern_.length());
|
||||
PatternChar pattern_first_char = search->pattern_[0];
|
||||
|
||||
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
||||
return FindFirstCharacter(search->pattern_, subject, index);
|
||||
} else {
|
||||
if (sizeof(PatternChar) > sizeof(SubjectChar)) {
|
||||
if (exceedsOneByte(pattern_first_char)) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return FindFirstCharacter(search->pattern_, subject, index);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Linear Search Strategy
|
||||
//---------------------------------------------------------------------
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
inline bool CharCompare(const PatternChar* pattern,
|
||||
const SubjectChar* subject,
|
||||
size_t length) {
|
||||
ASSERT_GT(length, 0);
|
||||
size_t pos = 0;
|
||||
do {
|
||||
if (pattern[pos] != subject[pos]) {
|
||||
return false;
|
||||
}
|
||||
pos++;
|
||||
} while (pos < length);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Simple linear search for short patterns. Never bails out.
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
size_t StringSearch<PatternChar, SubjectChar>::LinearSearch(
|
||||
StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t index) {
|
||||
Vector<const PatternChar> pattern = search->pattern_;
|
||||
CHECK_GT(pattern.length(), 1);
|
||||
const size_t pattern_length = pattern.length();
|
||||
size_t i = index;
|
||||
const size_t n = subject.length() - pattern_length;
|
||||
while (i <= n) {
|
||||
i = FindFirstCharacter(pattern, subject, i);
|
||||
if (i == subject.length())
|
||||
return subject.length();
|
||||
ASSERT_LE(i, n);
|
||||
i++;
|
||||
|
||||
// Loop extracted to separate function to allow using return to do
|
||||
// a deeper break.
|
||||
if (CharCompare(pattern.start() + 1, subject.start() + i,
|
||||
pattern_length - 1)) {
|
||||
return i - 1;
|
||||
}
|
||||
}
|
||||
return subject.length();
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Boyer-Moore string search
|
||||
//---------------------------------------------------------------------
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
|
||||
StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t start_index) {
|
||||
Vector<const PatternChar> pattern = search->pattern_;
|
||||
const size_t subject_length = subject.length();
|
||||
const size_t pattern_length = pattern.length();
|
||||
// Only preprocess at most kBMMaxShift last characters of pattern.
|
||||
size_t start = search->start_;
|
||||
|
||||
int* bad_char_occurence = search->bad_char_table();
|
||||
int* good_suffix_shift = search->good_suffix_shift_table();
|
||||
|
||||
PatternChar last_char = pattern[pattern_length - 1];
|
||||
size_t index = start_index;
|
||||
// Continue search from i.
|
||||
while (index <= subject_length - pattern_length) {
|
||||
size_t j = pattern_length - 1;
|
||||
int c;
|
||||
while (last_char != (c = subject[index + j])) {
|
||||
int shift = j - CharOccurrence(bad_char_occurence, c);
|
||||
index += shift;
|
||||
if (index > subject_length - pattern_length) {
|
||||
return subject.length();
|
||||
}
|
||||
}
|
||||
while (j >= 0 && pattern[j] == (c = subject[index + j])) {
|
||||
if (j == 0) {
|
||||
return index;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
if (j < start) {
|
||||
// we have matched more than our tables allow us to be smart about.
|
||||
// Fall back on BMH shift.
|
||||
index += pattern_length - 1 -
|
||||
CharOccurrence(bad_char_occurence,
|
||||
static_cast<SubjectChar>(last_char));
|
||||
} else {
|
||||
int gs_shift = good_suffix_shift[j + 1];
|
||||
int bc_occ = CharOccurrence(bad_char_occurence, c);
|
||||
int shift = j - bc_occ;
|
||||
if (gs_shift > shift) {
|
||||
shift = gs_shift;
|
||||
}
|
||||
index += shift;
|
||||
}
|
||||
}
|
||||
|
||||
return subject.length();
|
||||
}
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
|
||||
const size_t pattern_length = pattern_.length();
|
||||
const PatternChar* pattern = pattern_.start();
|
||||
// Only look at the last kBMMaxShift characters of pattern (from start_
|
||||
// to pattern_length).
|
||||
const size_t start = start_;
|
||||
const size_t length = pattern_length - start;
|
||||
|
||||
// Biased tables so that we can use pattern indices as table indices,
|
||||
// even if we only cover the part of the pattern from offset start.
|
||||
int* shift_table = good_suffix_shift_table();
|
||||
int* suffix_table = this->suffix_table();
|
||||
|
||||
// Initialize table.
|
||||
for (size_t i = start; i < pattern_length; i++) {
|
||||
shift_table[i] = length;
|
||||
}
|
||||
shift_table[pattern_length] = 1;
|
||||
suffix_table[pattern_length] = pattern_length + 1;
|
||||
|
||||
if (pattern_length <= start) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Find suffixes.
|
||||
PatternChar last_char = pattern[pattern_length - 1];
|
||||
size_t suffix = pattern_length + 1;
|
||||
{
|
||||
size_t i = pattern_length;
|
||||
while (i > start) {
|
||||
PatternChar c = pattern[i - 1];
|
||||
while (suffix <= pattern_length && c != pattern[suffix - 1]) {
|
||||
if (static_cast<size_t>(shift_table[suffix]) == length) {
|
||||
shift_table[suffix] = suffix - i;
|
||||
}
|
||||
suffix = suffix_table[suffix];
|
||||
}
|
||||
suffix_table[--i] = --suffix;
|
||||
if (suffix == pattern_length) {
|
||||
// No suffix to extend, so we check against last_char only.
|
||||
while ((i > start) && (pattern[i - 1] != last_char)) {
|
||||
if (static_cast<size_t>(shift_table[pattern_length]) == length) {
|
||||
shift_table[pattern_length] = pattern_length - i;
|
||||
}
|
||||
suffix_table[--i] = pattern_length;
|
||||
}
|
||||
if (i > start) {
|
||||
suffix_table[--i] = --suffix;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Build shift table using suffixes.
|
||||
if (suffix < pattern_length) {
|
||||
for (size_t i = start; i <= pattern_length; i++) {
|
||||
if (static_cast<size_t>(shift_table[i]) == length) {
|
||||
shift_table[i] = suffix - start;
|
||||
}
|
||||
if (i == suffix) {
|
||||
suffix = suffix_table[suffix];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Boyer-Moore-Horspool string search.
|
||||
//---------------------------------------------------------------------
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
|
||||
StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t start_index) {
|
||||
Vector<const PatternChar> pattern = search->pattern_;
|
||||
const size_t subject_length = subject.length();
|
||||
const size_t pattern_length = pattern.length();
|
||||
int* char_occurrences = search->bad_char_table();
|
||||
int64_t badness = -pattern_length;
|
||||
|
||||
// How bad we are doing without a good-suffix table.
|
||||
PatternChar last_char = pattern[pattern_length - 1];
|
||||
int last_char_shift =
|
||||
pattern_length - 1 -
|
||||
CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
|
||||
|
||||
// Perform search
|
||||
size_t index = start_index; // No matches found prior to this index.
|
||||
while (index <= subject_length - pattern_length) {
|
||||
size_t j = pattern_length - 1;
|
||||
int subject_char;
|
||||
while (last_char != (subject_char = subject[index + j])) {
|
||||
int bc_occ = CharOccurrence(char_occurrences, subject_char);
|
||||
int shift = j - bc_occ;
|
||||
index += shift;
|
||||
badness += 1 - shift; // at most zero, so badness cannot increase.
|
||||
if (index > subject_length - pattern_length) {
|
||||
return subject_length;
|
||||
}
|
||||
}
|
||||
j--;
|
||||
while (j >= 0 && pattern[j] == (subject[index + j])) {
|
||||
if (j == 0) {
|
||||
return index;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
index += last_char_shift;
|
||||
// Badness increases by the number of characters we have
|
||||
// checked, and decreases by the number of characters we
|
||||
// can skip by shifting. It's a measure of how we are doing
|
||||
// compared to reading each character exactly once.
|
||||
badness += (pattern_length - j) - last_char_shift;
|
||||
if (badness > 0) {
|
||||
search->PopulateBoyerMooreTable();
|
||||
search->strategy_ = &BoyerMooreSearch;
|
||||
return BoyerMooreSearch(search, subject, index);
|
||||
}
|
||||
}
|
||||
return subject.length();
|
||||
}
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
|
||||
const size_t pattern_length = pattern_.length();
|
||||
|
||||
int* bad_char_occurrence = bad_char_table();
|
||||
|
||||
// Only preprocess at most kBMMaxShift last characters of pattern.
|
||||
const size_t start = start_;
|
||||
// Run forwards to populate bad_char_table, so that *last* instance
|
||||
// of character equivalence class is the one registered.
|
||||
// Notice: Doesn't include the last character.
|
||||
const size_t table_size = AlphabetSize();
|
||||
if (start == 0) {
|
||||
// All patterns less than kBMMaxShift in length.
|
||||
memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
|
||||
} else {
|
||||
for (size_t i = 0; i < table_size; i++) {
|
||||
bad_char_occurrence[i] = start - 1;
|
||||
}
|
||||
}
|
||||
for (size_t i = start; i < pattern_length - 1; i++) {
|
||||
PatternChar c = pattern_[i];
|
||||
int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
|
||||
bad_char_occurrence[bucket] = i;
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Linear string search with bailout to BMH.
|
||||
//---------------------------------------------------------------------
|
||||
|
||||
// Simple linear search for short patterns, which bails out if the string
|
||||
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
size_t StringSearch<PatternChar, SubjectChar>::InitialSearch(
|
||||
StringSearch<PatternChar, SubjectChar>* search,
|
||||
Vector<const SubjectChar> subject,
|
||||
size_t index) {
|
||||
Vector<const PatternChar> pattern = search->pattern_;
|
||||
const size_t pattern_length = pattern.length();
|
||||
// Badness is a count of how much work we have done. When we have
|
||||
// done enough work we decide it's probably worth switching to a better
|
||||
// algorithm.
|
||||
int64_t badness = -10 - (pattern_length << 2);
|
||||
|
||||
// We know our pattern is at least 2 characters, we cache the first so
|
||||
// the common case of the first character not matching is faster.
|
||||
for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
|
||||
badness++;
|
||||
if (badness <= 0) {
|
||||
i = FindFirstCharacter(pattern, subject, i);
|
||||
if (i == subject.length())
|
||||
return subject.length();
|
||||
ASSERT_LE(i, n);
|
||||
size_t j = 1;
|
||||
do {
|
||||
if (pattern[j] != subject[i + j]) {
|
||||
break;
|
||||
}
|
||||
j++;
|
||||
} while (j < pattern_length);
|
||||
if (j == pattern_length) {
|
||||
return i;
|
||||
}
|
||||
badness += j;
|
||||
} else {
|
||||
search->PopulateBoyerMooreHorspoolTable();
|
||||
search->strategy_ = &BoyerMooreHorspoolSearch;
|
||||
return BoyerMooreHorspoolSearch(search, subject, i);
|
||||
}
|
||||
}
|
||||
return subject.length();
|
||||
}
|
||||
|
||||
// Perform a single stand-alone search.
|
||||
// If searching multiple times for the same pattern, a search
|
||||
// object should be constructed once and the Search function then called
|
||||
// for each search.
|
||||
template <typename SubjectChar, typename PatternChar>
|
||||
size_t SearchString(Vector<const SubjectChar> subject,
|
||||
Vector<const PatternChar> pattern,
|
||||
size_t start_index) {
|
||||
StringSearch<PatternChar, SubjectChar> search(pattern);
|
||||
return search.Search(subject, start_index);
|
||||
}
|
||||
}
|
||||
} // namespace node::stringsearch
|
||||
|
||||
namespace node {
|
||||
using node::stringsearch::Vector;
|
||||
|
||||
template <typename SubjectChar, typename PatternChar>
|
||||
size_t SearchString(const SubjectChar* haystack,
|
||||
size_t haystack_length,
|
||||
const PatternChar* needle,
|
||||
size_t needle_length,
|
||||
size_t start_index) {
|
||||
return node::stringsearch::SearchString(
|
||||
Vector<const SubjectChar>(haystack, haystack_length),
|
||||
Vector<const PatternChar>(needle, needle_length),
|
||||
start_index);
|
||||
}
|
||||
} // namespace node
|
||||
|
||||
#endif // SRC_STRING_SEARCH_H_
|
@ -65,6 +65,193 @@ assert.equal(b.indexOf(0x61, -Infinity), 0);
|
||||
assert.equal(b.indexOf(0x61, Infinity), -1);
|
||||
assert.equal(b.indexOf(0x0), -1);
|
||||
|
||||
// test offsets
var offsetCases = [
  // [needle, byteOffset, expected index]
  ['d', 2, 3],
  ['f', 5, 5],
  ['f', -1, 5],
  ['f', 6, -1]
];

// Needles given as strings.
offsetCases.forEach(function(testCase) {
  assert.equal(b.indexOf(testCase[0], testCase[1]), testCase[2]);
});

// The same needles given as Buffers.
offsetCases.forEach(function(testCase) {
  assert.equal(b.indexOf(Buffer(testCase[0]), testCase[1]), testCase[2]);
});

assert.equal(Buffer('ff').indexOf(Buffer('f'), 1, 'ucs2'), -1);
|
||||
|
||||
// test hex, base64, ascii and binary encodings
// For each encoding, `b` is round-tripped through that encoding and the
// needle (the encoded form of 'd') must be found at index 3, whether it is
// passed as an encoded string or as a Buffer.
var encodedNeedles = [
  // [encoding, needle]
  ['hex', '64'],
  ['base64', 'ZA=='],
  ['ascii', 'd'],
  ['binary', 'd']
];

encodedNeedles.forEach(function(pair) {
  var encoding = pair[0];
  var needle = pair[1];

  assert.equal(
      Buffer(b.toString(encoding), encoding)
          .indexOf(needle, 0, encoding), 3);
  assert.equal(
      Buffer(b.toString(encoding), encoding)
          .indexOf(Buffer(needle, encoding), 0, encoding), 3);
});
|
||||
|
||||
|
||||
// test ucs2 encoding
// Offsets passed to and returned from indexOf are byte offsets, so each
// two-byte character advances the expected index by 2.
// Kappa, Alpha, Sigma, Sigma, Epsilon.
var twoByteString = new Buffer('\u039a\u0391\u03a3\u03a3\u0395', 'ucs2');

assert.equal(twoByteString.indexOf('\u0395', 4, 'ucs2'), 8);
assert.equal(twoByteString.indexOf('\u03a3', -4, 'ucs2'), 6);
assert.equal(twoByteString.indexOf('\u03a3', -6, 'ucs2'), 4);
assert.equal(
    twoByteString.indexOf(new Buffer('\u03a3', 'ucs2'), -6, 'ucs2'), 4);
assert.equal(twoByteString.indexOf('\u03a3', -2, 'ucs2'), -1);

var mixedByteStringUcs2 =
    new Buffer('\u039a\u0391abc\u03a3\u03a3\u0395', 'ucs2');
assert.equal(mixedByteStringUcs2.indexOf('bc', 0, 'ucs2'), 6);
assert.equal(mixedByteStringUcs2.indexOf('\u03a3', 0, 'ucs2'), 10);
assert.equal(mixedByteStringUcs2.indexOf('\u0396', 0, 'ucs2'), -1);

assert.equal(
    mixedByteStringUcs2.indexOf(new Buffer('bc', 'ucs2'), 0, 'ucs2'), 6);
assert.equal(
    mixedByteStringUcs2.indexOf(new Buffer('\u03a3', 'ucs2'), 0, 'ucs2'), 10);
assert.equal(
    mixedByteStringUcs2.indexOf(new Buffer('\u0396', 'ucs2'), 0, 'ucs2'), -1);

// Test single char pattern
assert.equal(twoByteString.indexOf('\u039a', 0, 'ucs2'), 0);
assert.equal(twoByteString.indexOf('\u0391', 0, 'ucs2'), 2, 'Alpha');
assert.equal(twoByteString.indexOf('\u03a3', 0, 'ucs2'), 4, 'First Sigma');
assert.equal(twoByteString.indexOf('\u03a3', 6, 'ucs2'), 6, 'Second Sigma');
assert.equal(twoByteString.indexOf('\u0395', 0, 'ucs2'), 8, 'Epsilon');
assert.equal(twoByteString.indexOf('\u0392', 0, 'ucs2'), -1, 'Not beta');

// Test multi-char pattern
assert.equal(
    twoByteString.indexOf('\u039a\u0391', 0, 'ucs2'), 0, 'Kappa Alpha');
assert.equal(
    twoByteString.indexOf('\u0391\u03a3', 0, 'ucs2'), 2, 'Alpha Sigma');
assert.equal(
    twoByteString.indexOf('\u03a3\u03a3', 0, 'ucs2'), 4, 'Sigma Sigma');
assert.equal(
    twoByteString.indexOf('\u03a3\u0395', 0, 'ucs2'), 6, 'Sigma Epsilon');
|
||||
|
||||
// Default (utf8) encoding with a mix of two-byte and one-byte characters.
// Expected values are byte offsets into the encoded buffer.
var mixedByteStringUtf8 = new Buffer('\u039a\u0391abc\u03a3\u03a3\u0395');
assert.equal(mixedByteStringUtf8.indexOf('bc'), 5);
assert.equal(mixedByteStringUtf8.indexOf('bc', 5), 5);
assert.equal(mixedByteStringUtf8.indexOf('bc', -8), 5);
assert.equal(mixedByteStringUtf8.indexOf('\u03a3'), 7);
assert.equal(mixedByteStringUtf8.indexOf('\u0396'), -1);
|
||||
|
||||
|
||||
// Test complex string indexOf algorithms. Only trigger for long strings.
// Long string that isn't a simple repeat of a shorter string.
// Each step wraps the previous string around the next letter
// ('A' -> 'ABA' -> 'ABACABA' -> ...).
var longString = 'A';
for (var i = 66; i < 76; i++) {  // from 'B' to 'K'
  longString = longString + String.fromCharCode(i) + longString;
}

var longBufferString = new Buffer(longString);

// pattern of 15 chars, repeated every 16 chars in long
var pattern = 'ABACABADABACABA';
for (i = 0; i < longBufferString.length - pattern.length; i += 7) {
  var index = longBufferString.indexOf(pattern, i);
  // The next occurrence starts at the next multiple of 16 at or after i.
  assert.equal(
      index, (i + 15) & ~0xf, 'Long ABACABA...-string at index ' + i);
}
assert.equal(
    longBufferString.indexOf('AJABACA'), 510, 'Long AJABACA, First J');
assert.equal(
    longBufferString.indexOf('AJABACA', 511), 1534, 'Long AJABACA, Second J');

pattern = 'JABACABADABACABA';
assert.equal(
    longBufferString.indexOf(pattern), 511, 'Long JABACABA..., First J');
assert.equal(
    longBufferString.indexOf(pattern, 512), 1535, 'Long JABACABA..., Second J');
|
||||
|
||||
// Search for a non-ASCII string in a pure ASCII string.
var asciiString = new Buffer(
    'arglebargleglopglyfarglebargleglopglyfarglebargleglopglyf');
// '\u2061' is a single non-ASCII character. The previous '\x2061' was parsed
// as '\x20' followed by '61', i.e. the plain ASCII string ' 61'.
assert.equal(asciiString.indexOf('\u2061'), -1);
assert.equal(asciiString.indexOf('leb', 0), 3);
|
||||
|
||||
// Search in string containing many non-ASCII chars.
// Build the string one code unit at a time (0x0000 through 0xFFFF). Passing
// all 65536 values through Function.prototype.apply in a single call depends
// on the engine's argument-count / stack limits.
var allCharsString = '';
for (var i = 0; i < 65536; i++) allCharsString += String.fromCharCode(i);
var allCharsBufferUtf8 = new Buffer(allCharsString);
var allCharsBufferUcs2 = new Buffer(allCharsString, 'ucs2');

// Search for string long enough to trigger complex search with ASCII pattern
// and UC16 subject.
assert.equal(allCharsBufferUtf8.indexOf('notfound'), -1);
assert.equal(allCharsBufferUcs2.indexOf('notfound'), -1);

// Find substrings in Utf8.
var lengths = [1, 3, 15];  // Single char, simple and complex.
var indices = [0x5, 0x60, 0x400, 0x680, 0x7ee, 0xFF02, 0x16610, 0x2f77b];
for (var lengthIndex = 0; lengthIndex < lengths.length; lengthIndex++) {
  for (i = 0; i < indices.length; i++) {
    var index = indices[i];
    var length = lengths[lengthIndex];

    // Widen the slice as the byte offset moves into regions where each
    // character takes more bytes.
    if (index + length > 0x7F) {
      length = 2 * length;
    }

    if (index + length > 0x7FF) {
      length = 3 * length;
    }

    if (index + length > 0xFFFF) {
      length = 4 * length;
    }

    var patternBufferUtf8 = allCharsBufferUtf8.slice(index, index + length);
    assert.equal(allCharsBufferUtf8.indexOf(patternBufferUtf8), index);

    var patternStringUtf8 = patternBufferUtf8.toString();
    assert.equal(allCharsBufferUtf8.indexOf(patternStringUtf8), index);
  }
}

// Find substrings in Ucs2.
lengths = [2, 4, 16];  // Single char, simple and complex.
indices = [0x5, 0x65, 0x105, 0x205, 0x285, 0x2005, 0x2085, 0xfff0];
for (lengthIndex = 0; lengthIndex < lengths.length; lengthIndex++) {
  for (i = 0; i < indices.length; i++) {
    // indices are character positions; each character occupies two bytes.
    index = indices[i] * 2;
    length = lengths[lengthIndex];

    var patternBufferUcs2 =
        allCharsBufferUcs2.slice(index, index + length);
    assert.equal(
        allCharsBufferUcs2.indexOf(patternBufferUcs2, 0, 'ucs2'), index);

    var patternStringUcs2 = patternBufferUcs2.toString('ucs2');
    assert.equal(
        allCharsBufferUcs2.indexOf(patternStringUcs2, 0, 'ucs2'), index);
  }
}
|
||||
|
||||
// A function is not an accepted search value: indexOf is expected to throw.
var invalidNeedle = function() { };
assert.throws(function() {
  b.indexOf(invalidNeedle);
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user