MDEV-26849: JSON Histograms: point selectivity estimates are off

.. for non-existent values. Handle this special case.
2021-10-22 19:43:19 +03:00 · 2021-10-22 19:43:19 +03:00 · 05877df472
commit 05877df472
parent f3f78bed85
8 changed files with 142 additions and 10 deletions
--- a/mysql-test/main/statistics.result
+++ b/mysql-test/main/statistics.result
@ -1885,6 +1885,7 @@ t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15
 explain select * from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	229376	
 drop table t0;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_histogram_size;
--- a/mysql-test/main/statistics.test
+++ b/mysql-test/main/statistics.test
@ -1019,7 +1019,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 from mysql.column_stats;
 explain select * from t1;
-
+drop table t0;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_histogram_size;
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@ -4148,6 +4148,7 @@ t1	id	1	17384	0.0000	4.0000	14.0000	{
 explain select * from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	229376	
 drop table t0;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_histogram_size;
@ -7530,3 +7531,104 @@ select c from t1 where c > '1';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	16	16.00	75.00	75.00	Using where
 drop table t1;
 #
 # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
 #
 #
 create table t0(a int);
 insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1(a int);
 insert into t1 select 100*A.a from t0 A, t0 B, t0 C;
 select a, count(*) from t1 group by a order by a;
 a	count(*)
 0	100
 100	100
 200	100
 300	100
 400	100
 500	100
 600	100
 700	100
 800	100
 900	100
 set histogram_type=json_hb, histogram_size=default;
 analyze table t1 persistent for all;
 Table	Op	Msg_type	Msg_text
 test.t1	analyze	status	Engine-independent statistics collected
 test.t1	analyze	status	OK
 select * from mysql.column_stats where table_name='t1';
 db_name	table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	hist_size	hist_type	histogram
 test	t1	a	0	900	0.0000	4.0000	100.0000	10	JSON_HB	{
  "histogram_hb_v2": [
    {
      "start": "0",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "100",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "200",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "300",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "400",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "500",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "600",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "700",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "800",
      "size": 0.1,
      "ndv": 1
    },
    {
      "start": "900",
      "end": "900",
      "size": 0.1,
      "ndv": 1
    }
  ]
 }
 analyze select * from t1 where a=0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	10.00	10.00	Using where
 analyze select * from t1 where a=50;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	0.10	0.00	Using where
 analyze select * from t1 where a=70;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	0.10	0.00	Using where
 analyze select * from t1 where a=100;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	10.00	10.00	Using where
 analyze select * from t1 where a=150;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	0.10	0.00	Using where
 analyze select * from t1 where a=200;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	10.00	10.00	Using where
 drop table t0,t1;
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@ -295,3 +295,25 @@ select c from t1 where c > '1';
 drop table t1;
 --echo #
 --echo # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
 --echo #
 --echo #
 create table t0(a int);
 insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1(a int);
 insert into t1 select 100*A.a from t0 A, t0 B, t0 C;
 select a, count(*) from t1 group by a order by a;
 set histogram_type=json_hb, histogram_size=default;
 analyze table t1 persistent for all;
 select * from mysql.column_stats where table_name='t1';
 analyze select * from t1 where a=0;
 analyze select * from t1 where a=50;
 analyze select * from t1 where a=70;
 analyze select * from t1 where a=100;
 analyze select * from t1 where a=150;
 analyze select * from t1 where a=200;
 drop table t0,t1;
--- a/sql/opt_histogram_json.cc
+++ b/sql/opt_histogram_json.cc
@ -616,7 +616,7 @@ double position_in_interval(Field *field, const  uchar *key, uint key_len,
 double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
-                                            double avg_sel)
+                                            double avg_sel, double total_rows)
 {
  const uchar *key = endpoint->key;
  if (field->real_maybe_null())
@ -631,9 +631,11 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
  if (buckets[idx].ndv == 1 && !equal)
  {
-    // The bucket has a single value and it doesn't match! Use the global
+    /*
-    // average.
+      The bucket has a single value and it doesn't match! Return a very
-    sel= avg_sel;
+      small value.
    */
    sel= 1.0 / total_rows;
  }
  else
  {
--- a/sql/opt_histogram_json.h
+++ b/sql/opt_histogram_json.h
@ -109,7 +109,8 @@ public:
                           ulonglong size) override;
  double point_selectivity(Field *field, key_range *endpoint,
-                           double avg_selection) override;
+                           double avg_selection,
                           double total_rows) override;
  double range_selectivity(Field *field, key_range *min_endp,
                           key_range *max_endp) override;
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@ -3882,7 +3882,8 @@ double get_column_range_cardinality(Field *field,
        {
          res= col_non_nulls * 
 	       hist->point_selectivity(field, min_endp,
-                                       avg_frequency / col_non_nulls);
+                                       avg_frequency / col_non_nulls,
                                       tab_records);
        }
      }
      else if (avg_frequency == 0.0)
@ -3973,7 +3974,8 @@ double get_column_range_cardinality(Field *field,
 */
 double Histogram_binary::point_selectivity(Field *field, key_range *endpoint,
-                                           double avg_sel)
+                                           double avg_sel,
                                           double total_records)
 {
  double sel;
  Column_statistics *col_stats= field->read_stats;
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@ -186,7 +186,8 @@ public:
  virtual double point_selectivity(Field *field, key_range *endpoint,
-                                   double avg_selection)=0;
+                                   double avg_selectivity,
                                   double total_rows)=0;
  virtual double range_selectivity(Field *field, key_range *min_endp,
                                   key_range *max_endp)=0;
@ -355,7 +356,8 @@ public:
    Estimate selectivity of "col=const" using a histogram
  */
  double point_selectivity(Field *field, key_range *endpoint,
-                           double avg_sel) override;
+                           double avg_sel,
                           double total_rows) override;
 };