MDEV-5926, MDEV-4362 post-fixes:

- Histogram::find_bucket() should not walk off the end of the value range.
- Address review feedback in Histogram::point_selectivity(): handle zero-width buckets differently, and add explanatory comments.
2014-03-27 12:30:49 +04:00 · 2014-03-27 12:30:49 +04:00 · ab061a2bb3
commit ab061a2bb3
parent dee11f9633
3 changed files with 45 additions and 23 deletions
--- a/mysql-test/r/selectivity.result
+++ b/mysql-test/r/selectivity.result
@ -1378,7 +1378,7 @@ Note	1003	select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1
 # Must not cause fp division by zero, or produce nonsense numbers:
 explain extended select * from t1 where col1 in (-1,-2,-3);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	10000	3.00	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	10000	5.94	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (<cache>(-(1)),<cache>(-(2)),<cache>(-(3))))
 explain extended select * from t1 where col1<=-1;
--- a/mysql-test/r/selectivity_innodb.result
+++ b/mysql-test/r/selectivity_innodb.result
@ -1388,7 +1388,7 @@ Note	1003	select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1
 # Must not cause fp division by zero, or produce nonsense numbers:
 explain extended select * from t1 where col1 in (-1,-2,-3);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	10000	3.00	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	10000	5.94	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (<cache>(-(1)),<cache>(-(2)),<cache>(-(3))))
 explain extended select * from t1 where col1<=-1;
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@ -151,6 +151,7 @@ private:
    }
    return 0;
  }
  /* Find the bucket which value 'pos' falls into. */
  uint find_bucket(double pos, bool first)
  {
@ -171,7 +172,7 @@ private:
        break;
    }
-    if (val > get_value(i))
+    if (val > get_value(i) && i < (get_width() - 1))
      i++;
    if (val == get_value(i))
@ -251,6 +252,27 @@ public:
    @return
       Expected condition selectivity (a number between 0 and 1)
    @notes 
       [re_zero_length_buckets] If a bucket with zero value-length is in the
       middle of the histogram, we will not have min==max. Example: suppose, 
       pos_value=0x12, and the histogram is:
             #n  #n+1 #n+2                 
        ... 0x10 0x12 0x12 0x14 ...
                        |
                        +------------- bucket with zero value-length
        Here, we will get min=#n+1, max=#n+2, and use the multi-bucket formula.
        The problem happens at the ends of the histogram. If pos_value=0, and the
        histogram is:
        0x00 0x10 ...
        then min=0, max=0. This means pos_value is contained within bucket #0,
        but on the other hand, histogram data says that the bucket has only one
        value.
  */
  double point_selectivity(double pos, double avg_sel)
@ -264,6 +286,16 @@ public:
    uint max= min;
    while (max + 1 < get_width() && get_value(max + 1) == pos_value)
      max++;
    /*
      A special case: we're looking at a single bucket, and that bucket has
      zero value-length. Use the multi-bucket formula (an attempt to use the
      single-bucket formula would cause division by zero).
      For more details see [re_zero_length_buckets] above.
    */
    if (max == min && get_value(max) == ((max==0)? 0 : get_value(max-1)))
      max++;
    if (max > min)
    {
@ -302,27 +334,17 @@ public:
          (max + 1 == get_width() ?  1.0 : (get_value(max) * inv_prec_factor)) -
          (min == 0 ?  0.0 : (get_value(min-1) * inv_prec_factor));
-      if (current_bucket_width < 1e-16)
+      DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
      {
        /*
          A special case: we are at the first (or the last) bucket in the
          histogram, the bucket's value range is a singlepoint [x,x], and 
          pos_value=0 (for the first bucket) or pos_value=1 (for the last).
        */
        sel= avg_sel;
      }
      else
      {
        /*
          So:
          - each bucket has the same #rows 
          - values are unformly distributed across the [min_value,max_value] domain.
-          If a bucket has value range that's N times bigger then average, than
+      /*
-          each value will have to have N times fewer rows than average.
+        So:
-        */
+        - each bucket has the same #rows 
-        sel= avg_sel * avg_bucket_width / current_bucket_width;
+        - values are uniformly distributed across the [min_value,max_value] domain.
-      }
+
        If a bucket has a value range that's N times bigger than average, then
        each value will have to have N times fewer rows than average.
      */
      sel= avg_sel * avg_bucket_width / current_bucket_width;
      /*
        (Q: if we just follow this proportion we may end up in a situation