[Feature #19643] Direct primitive compare sort for Array#sort_by

In most of case `sort_by` works on primitive type.
Using `qsort_r` with function pointer is much slower than compare data directly.

I implement an intro sort which compare primitive data directly for `sort_by`.
We can even afford an O(n) type check before primitive data sort.
It still go faster.
This commit is contained in:
nekoyama32767 2023-05-20 19:40:27 +09:00 committed by GitHub
parent 892798cac8
commit 87217f26f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
Notes: git 2023-05-20 10:40:47 +00:00
Merged: https://github.com/ruby/ruby/pull/7805

Merged-By: nobu <nobu@ruby-lang.org>
2 changed files with 247 additions and 4 deletions

View File

@ -0,0 +1,53 @@
prelude: |
array_length = 2
fixnum_array2 = array_length.times.to_a.map {rand(10000)}
float_array2 = array_length.times.to_a.map {rand(10000.0).to_f}
string_array2 = array_length.times.to_a.map {"r" * rand(1..10000)}
mix_array2 = array_length.times.to_a.map {if rand(1..100) <= 50 then rand(1..10000).to_f else rand(1..10000) end}
all_zero_array2 =array_length.times.to_a.map {0}
array_length = 10
fixnum_array10 = array_length.times.to_a.map {rand(10000)}
float_array10 = array_length.times.to_a.map {rand(10000.0).to_f}
string_array10 = array_length.times.to_a.map {"r" * rand(1..10000)}
mix_array10 = array_length.times.to_a.map {if rand(1..100) <= 50 then rand(1..10000).to_f else rand(1..10000) end}
all_zero_array10 =array_length.times.to_a.map {0}
array_length = 1000
fixnum_array1000 = array_length.times.to_a.map {rand(10000)}
float_array1000 = array_length.times.to_a.map {rand(10000.0).to_f}
string_array1000 = array_length.times.to_a.map {"r" * rand(1..10000)}
mix_array1000 = array_length.times.to_a.map {if rand(1..100) <= 50 then rand(1..10000).to_f else rand(1..10000) end}
all_zero_array1000 =array_length.times.to_a.map {0}
array_length = 100000
fixnum_array100000 = array_length.times.to_a.map {rand(10000)}
float_array100000 = array_length.times.to_a.map {rand(10000.0).to_f}
string_array100000 = array_length.times.to_a.map {"r" * rand(1..10000)}
mix_array100000 = array_length.times.to_a.map {if rand(1..100) <= 50 then rand(1..10000).to_f else rand(1..10000) end}
all_zero_array100000 =array_length.times.to_a.map {0}
benchmark:
fixnum_array2.sort_by: fixnum_array2.sort_by {|a| a}
float_array2.sort_by: float_array2.sort_by {|a| a}
string_length2.sort_by: string_array2.sort_by {|a| a.length}
mix_array2.sort_by: mix_array2.sort_by {|a| a}
all_zero2.sort_by: all_zero_array2.sort_by{|a| a}
fixnum_array10.sort_by: fixnum_array10.sort_by {|a| a}
float_array10.sort_by: float_array10.sort_by {|a| a}
string_length10.sort_by: string_array10.sort_by {|a| a.length}
mix_array10.sort_by: mix_array10.sort_by {|a| a}
all_zero10.sort_by: all_zero_array10.sort_by{|a| a}
fixnum_array1000.sort_by: fixnum_array1000.sort_by {|a| a}
float_array1000.sort_by: float_array1000.sort_by {|a| a}
string_length1000.sort_by: string_array1000.sort_by {|a| a.length}
mix_array1000.sort_by: mix_array1000.sort_by {|a| a}
all_zero1000.sort_by: all_zero_array1000.sort_by{|a| a}
fixnum_array100000.sort_by: fixnum_array100000.sort_by {|a| a}
float_array100000.sort_by: float_array100000.sort_by {|a| a}
string_length100000.sort_by: string_array100000.sort_by {|a| a.length}
mix_array100000.sort_by: mix_array100000.sort_by {|a| a}
all_zero100000.sort_by: all_zero_array100000.sort_by{|a| a}

198
enum.c
View File

@ -1334,10 +1334,12 @@ enum_sort(VALUE obj)
}
#define SORT_BY_BUFSIZE 16
#define SORT_BY_UNIFORMED(num, flo, fix) (((num&1)<<2)|((flo&1)<<1)|fix)
struct sort_by_data {
const VALUE ary;
const VALUE buf;
long n;
uint8_t n;
uint8_t primitive_uniformed;
};
static VALUE
@ -1358,6 +1360,11 @@ sort_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, _data))
rb_raise(rb_eRuntimeError, "sort_by reentered");
}
if (data->primitive_uniformed) {
data->primitive_uniformed &= SORT_BY_UNIFORMED((FIXNUM_P(v)) || (RB_FLOAT_TYPE_P(v)),
RB_FLOAT_TYPE_P(v),
FIXNUM_P(v));
}
RARRAY_ASET(data->buf, data->n*2, v);
RARRAY_ASET(data->buf, data->n*2+1, i);
data->n++;
@ -1385,6 +1392,179 @@ sort_by_cmp(const void *ap, const void *bp, void *data)
return OPTIMIZED_CMP(a, b);
}
/*
This is parts of uniform sort
*/
#define uless rb_uniform_is_less
#define UNIFORM_SWAP(a,b)\
do{struct rb_uniform_sort_data tmp = a; a = b; b = tmp;} while(0)
struct rb_uniform_sort_data {
VALUE v;
VALUE i;
};
static inline bool
rb_uniform_is_less(VALUE a, VALUE b)
{
if (FIXNUM_P(a) && FIXNUM_P(b)) {
return (SIGNED_VALUE)a < (SIGNED_VALUE)b;
}
else if (FIXNUM_P(a)) {
RUBY_ASSERT(RB_FLOAT_TYPE_P(b));
return rb_float_cmp(b, a) > 0;
}
else {
RUBY_ASSERT(RB_FLOAT_TYPE_P(a));
return rb_float_cmp(a, b) < 0;
}
}
static inline bool
rb_uniform_is_larger(VALUE a, VALUE b)
{
if (FIXNUM_P(a) && FIXNUM_P(b)) {
return (SIGNED_VALUE)a > (SIGNED_VALUE)b;
}
else if (FIXNUM_P(a)) {
RUBY_ASSERT(RB_FLOAT_TYPE_P(b));
return rb_float_cmp(b, a) < 0;
}
else {
RUBY_ASSERT(RB_FLOAT_TYPE_P(a));
return rb_float_cmp(a, b) > 0;
}
}
#define med3_val(a,b,c) (uless(a,b)?(uless(b,c)?b:uless(c,a)?a:c):(uless(c,b)?b:uless(a,c)?a:c))
static void
rb_uniform_insertionsort_2(struct rb_uniform_sort_data* ptr_begin,
struct rb_uniform_sort_data* ptr_end)
{
if ((ptr_end - ptr_begin) < 2) return;
struct rb_uniform_sort_data tmp, *j, *k,
*index = ptr_begin+1;
for (; index < ptr_end; index++) {
tmp = *index;
j = k = index;
if (uless(tmp.v, ptr_begin->v)) {
while (ptr_begin < j) {
*j = *(--k);
j = k;
}
}
else {
while (uless(tmp.v, (--k)->v)) {
*j = *k;
j = k;
}
}
*j = tmp;
}
}
static inline void
rb_uniform_heap_down_2(struct rb_uniform_sort_data* ptr_begin,
size_t offset, size_t len)
{
size_t c;
struct rb_uniform_sort_data tmp = ptr_begin[offset];
while ((c = (offset<<1)+1) <= len) {
if (c < len && uless(ptr_begin[c].v, ptr_begin[c+1].v)) {
c++;
}
if (!uless(tmp.v, ptr_begin[c].v)) break;
ptr_begin[offset] = ptr_begin[c];
offset = c;
}
ptr_begin[offset] = tmp;
}
static void
rb_uniform_heapsort_2(struct rb_uniform_sort_data* ptr_begin,
struct rb_uniform_sort_data* ptr_end)
{
size_t n = ptr_end - ptr_begin;
if (n < 2) return;
for (size_t offset = n>>1; offset > 0;) {
rb_uniform_heap_down_2(ptr_begin, --offset, n-1);
}
for (size_t offset = n-1; offset > 0;) {
UNIFORM_SWAP(*ptr_begin, ptr_begin[offset]);
rb_uniform_heap_down_2(ptr_begin, 0, --offset);
}
}
static void
rb_uniform_quicksort_intro_2(struct rb_uniform_sort_data* ptr_begin,
struct rb_uniform_sort_data* ptr_end, size_t d)
{
if (ptr_end - ptr_begin <= 16) {
rb_uniform_insertionsort_2(ptr_begin, ptr_end);
return;
}
if (d == 0) {
rb_uniform_heapsort_2(ptr_begin, ptr_end);
return;
}
VALUE x = med3_val(ptr_begin->v,
ptr_begin[(ptr_end - ptr_begin)>>1].v,
ptr_end[-1].v);
struct rb_uniform_sort_data *i = ptr_begin;
struct rb_uniform_sort_data *j = ptr_end-1;
do {
while (uless(i->v, x)) i++;
while (uless(x, j->v)) j--;
if (i <= j) {
UNIFORM_SWAP(*i, *j);
i++;
j--;
}
} while (i <= j);
j++;
if (ptr_end - j > 1) rb_uniform_quicksort_intro_2(j, ptr_end, d-1);
if (i - ptr_begin > 1) rb_uniform_quicksort_intro_2(ptr_begin, i, d-1);
}
/**
* Direct primitive data compare sort. Implement with intro sort.
* @param[in] ptr_begin The begin address of target rb_ary's raw pointer.
* @param[in] ptr_end The end address of target rb_ary's raw pointer.
**/
static void
rb_uniform_intro_sort_2(struct rb_uniform_sort_data* ptr_begin,
struct rb_uniform_sort_data* ptr_end)
{
size_t n = ptr_end - ptr_begin;
size_t d = CHAR_BIT * sizeof(n) - nlz_intptr(n) - 1;
bool sorted_flag = true;
for (struct rb_uniform_sort_data* ptr = ptr_begin+1; ptr < ptr_end; ptr++) {
if (rb_uniform_is_larger((ptr-1)->v, (ptr)->v)) {
sorted_flag = false;
break;
}
}
if (sorted_flag) {
return;
}
rb_uniform_quicksort_intro_2(ptr_begin, ptr_end, d<<1);
}
#undef uless
/*
* call-seq:
* sort_by {|element| ... } -> array
@ -1491,6 +1671,9 @@ enum_sort_by(VALUE obj)
RB_OBJ_WRITE(memo, &data->ary, ary);
RB_OBJ_WRITE(memo, &data->buf, buf);
data->n = 0;
data->primitive_uniformed = SORT_BY_UNIFORMED((CMP_OPTIMIZABLE(FLOAT) && CMP_OPTIMIZABLE(INTEGER)),
CMP_OPTIMIZABLE(FLOAT),
CMP_OPTIMIZABLE(INTEGER));
rb_block_call(obj, id_each, 0, 0, sort_by_i, (VALUE)memo);
ary = data->ary;
buf = data->buf;
@ -1499,9 +1682,16 @@ enum_sort_by(VALUE obj)
rb_ary_concat(ary, buf);
}
if (RARRAY_LEN(ary) > 2) {
RARRAY_PTR_USE(ary, ptr,
ruby_qsort(ptr, RARRAY_LEN(ary)/2, 2*sizeof(VALUE),
sort_by_cmp, (void *)ary));
if (data->primitive_uniformed) {
RARRAY_PTR_USE(ary, ptr,
rb_uniform_intro_sort_2((struct rb_uniform_sort_data*)ptr,
(struct rb_uniform_sort_data*)(ptr + RARRAY_LEN(ary))));
}
else {
RARRAY_PTR_USE(ary, ptr,
ruby_qsort(ptr, RARRAY_LEN(ary)/2, 2*sizeof(VALUE),
sort_by_cmp, (void *)ary));
}
}
if (RBASIC(ary)->klass) {
rb_raise(rb_eRuntimeError, "sort_by reentered");