MDEV-16188: Use in-memory PK filters built from range index scans

First phase: make optimizer choose to use filter and show it in EXPLAIN.
2018-08-16 00:24:52 +03:00 · 2018-08-16 00:24:52 +03:00 · 8d5a11122c
commit 8d5a11122c
parent befc09f002
17 changed files with 722 additions and 18 deletions
--- a/mysql-test/main/rowid_filter.test
+++ b/mysql-test/main/rowid_filter.test
@ -0,0 +1,142 @@
 --disable_warnings
 DROP DATABASE IF EXISTS dbt3_s001;
 --enable_warnings
 CREATE DATABASE dbt3_s001;
 use dbt3_s001;
 --disable_query_log
 --disable_result_log
 --disable_warnings
 --source include/dbt3_s001.inc
 --enable_warnings
 --enable_result_log
 --enable_query_log
 --echo # lineitem : {i_l_receiptdate, i_l_shipdate} -> i_l_receiptdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
       l_receiptdate BETWEEN '1997-01-10' AND '1997-01-25';
 --echo # lineitem : {i_l_receiptdate, i_l_shipdate} -> i_l_receiptdate
 --echo # orders : {i_o_orderdate} -> i_o_orderdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
       l_receiptdate BETWEEN '1997-01-10' AND '1997-01-25' AND
       o_orderdate > '1997-01-15';
 --echo # lineitem : {i_l_receiptdate, i_l_shipdate,
 --echo #             i_l_commitdate} -> i_l_receiptdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
       l_receiptdate BETWEEN '1997-01-10' AND '1997-01-25' AND
       l_commitdate BETWEEN '1997-01-05' AND '1997-01-25';
 --echo # lineitem : {i_l_receiptdate, i_l_shipdate,
 --echo #             i_l_commitdate} -> i_l_commitdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
       l_receiptdate BETWEEN '1997-01-01' AND '1997-01-25' AND
       l_commitdate BETWEEN '1997-01-15' AND '1997-01-25';
 CREATE INDEX i_l_extendedprice ON lineitem(l_extendedprice);
 --echo # lineitem : {i_l_receiptdate, i_l_shipdate, i_l_commitdate,
 --echo #             i_l_extendedprice} -> i_l_extendedprice
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1996-11-01' AND '1997-01-21' AND
       l_receiptdate BETWEEN '1996-11-21' AND '1997-01-25' AND
       l_commitdate BETWEEN '1996-11-25' AND '1997-01-20' AND
       l_extendedprice BETWEEN 26000 AND 27000;
 --echo # lineitem : {i_l_shipdate, i_l_extendedprice} -> i_l_shipdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1997-01-11' AND '1997-01-21' AND
       l_extendedprice BETWEEN 26000 AND 27000;
 --echo # lineitem : {i_l_shipdate, i_l_extendedprice} -> i_l_extendedprice
 --echo # intersection point in the I quadrant
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE (l_shipdate BETWEEN '1997-01-11' AND '1997-01-26' OR
       l_shipdate BETWEEN '1995-02-01' AND '1995-02-14' OR
       l_shipdate BETWEEN '1994-12-12' AND '1994-12-28'
       ) AND l_extendedprice BETWEEN 26000 AND 27000;
 --echo # lineitem : {i_l_shipdate, i_l_extendedprice} -> i_l_shipdate
 --echo # parallel lines
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE (l_shipdate BETWEEN '1997-01-11' AND '1997-01-26' OR
       l_shipdate BETWEEN '1995-02-01' AND '1995-02-16' OR
       l_shipdate BETWEEN '1994-12-12' AND '1994-12-27'
       ) AND l_extendedprice BETWEEN 26000 AND 27000;
 CREATE INDEX i_l_discount ON lineitem(l_discount);
 CREATE INDEX i_l_tax ON lineitem(l_tax);
 --echo # lineitem : {i_l_receiptdate, i_l_shipdate, i_l_commitdate,
 --echo #             i_l_extendedprice, i_l_discount, i_l_tax}
 --echo #             -> {i_l_extendedprice}
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1996-11-01' AND '1997-01-21' AND
       l_receiptdate BETWEEN '1996-11-21' AND '1997-01-25' AND
       l_commitdate BETWEEN '1996-11-25' AND '1997-01-20' AND
       l_extendedprice BETWEEN 26000 AND 27000 AND
       l_discount BETWEEN 0 AND 0.01 AND
       l_tax BETWEEN 0.03 AND 0.04;
 DROP INDEX i_l_extendedprice on lineitem;
 DROP INDEX i_l_discount on lineitem;
 DROP INDEX i_l_tax on lineitem;
 SET max_rowid_filter_size= 1024;
 --echo # lineitem : {i_l_shipdate, i_l_receiptdate, i_l_commitdate}
 --echo #             -> i_l_shipdate
 --echo #            i_l_commitdate isn't in-memory -> isn't used
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE  l_shipdate BETWEEN '1996-12-28' AND '1997-01-20' AND
       l_receiptdate BETWEEN '1996-12-21' AND '1997-01-25' AND
       l_commitdate BETWEEN '1996-12-01' AND '1997-01-25';
 SET max_rowid_filter_size= DEFAULT;
 --echo # lineitem : {i_l_shipdate, i_l_commitdate} -> i_l_commitdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE l_shipdate BETWEEN '1993-01-01' AND '1997-01-30' AND
      l_commitdate BETWEEN '1997-01-10' AND '1997-01-12';
 --echo # lineitem : {i_l_shipdate, i_l_commitdate} -> i_l_commitdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE l_shipdate BETWEEN '1993-01-01' AND '1997-01-30' AND
      l_commitdate BETWEEN '1993-01-10' AND '1997-01-12';
 --echo # lineitem : {i_l_shipdate, i_l_commitdate, i_l_receiptdate}
 --echo #             -> i_l_receiptdate
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE l_shipdate BETWEEN '1993-01-01' AND '1997-01-30' AND
      l_commitdate BETWEEN '1993-01-10' AND '1997-01-12' AND
      l_receiptdate BETWEEN '1997-01-10' AND '1997-01-12';
 --echo # lineitem : {i_l_shipdate, i_l_receiptdate} -> i_l_receiptdate
 --echo # indexes with high selectivity
 EXPLAIN SELECT *
 FROM orders JOIN lineitem ON o_orderkey=l_orderkey
 WHERE l_shipdate BETWEEN '1997-01-09' AND '1997-01-10' AND
      l_receiptdate BETWEEN '1997-01-09' AND '1997-01-10';
 DROP DATABASE dbt3_s001;
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@ -134,6 +134,7 @@ SET (SQL_SOURCE
               sql_sequence.cc sql_sequence.h ha_sequence.h
               sql_tvc.cc sql_tvc.h
               opt_split.cc
               rowid_filter.cc rowid_filter.h
 	       ${WSREP_SOURCES}
               table_cache.cc encryption.cc temporary_tables.cc
               proxy_protocol.cc
--- a/sql/handler.cc
+++ b/sql/handler.cc
@ -2601,6 +2601,22 @@ LEX_CSTRING *handler::engine_name()
 }
 /**
  The method returns the cost of the random I/O accesses when
  index is used.
 */
 double handler::get_io_cost(uint index, ha_rows rows, uint *length)
 {
  uint len= table->key_info[index].key_length + ref_length;
  if (index == table->s->primary_key && table->file->primary_key_is_clustered())
    len= table->s->stored_rec_length;
  double keys_per_block= (stats.block_size/2.0/len+1);
  *length= len;
  return (rows + keys_per_block-1)/ keys_per_block;
 }
 double handler::keyread_time(uint index, uint ranges, ha_rows rows)
 {
  /*
@ -2612,11 +2628,8 @@ double handler::keyread_time(uint index, uint ranges, ha_rows rows)
    engines that support that (e.g. InnoDB) may want to overwrite this method.
    The model counts in the time to read index entries from cache.
  */
-  size_t len= table->key_info[index].key_length + ref_length;
+  uint len;
-  if (index == table->s->primary_key && table->file->primary_key_is_clustered())
+  return get_io_cost(index, rows, &len) +
    len= table->s->stored_rec_length;
  double keys_per_block= (stats.block_size/2.0/len+1);
  return (rows + keys_per_block-1)/ keys_per_block +
         len*rows/(stats.block_size+1)/TIME_FOR_COMPARE ;
 }
--- a/sql/handler.h
+++ b/sql/handler.h
@ -3221,6 +3221,8 @@ public:
  */
  virtual double keyread_time(uint index, uint ranges, ha_rows rows);
  double get_io_cost(uint index, ha_rows rows, uint *length);
  virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; }
  /*
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@ -10515,6 +10515,7 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
  handler *file= param->table->file;
  ha_rows rows= HA_POS_ERROR;
  uint keynr= param->real_keynr[idx];
  uint length;
  DBUG_ENTER("check_quick_select");
  /* Handle cases when we don't have a valid non-empty list of range */
@ -10573,6 +10574,8 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
        MY_MIN(param->table->quick_condition_rows, rows);
      param->table->quick_rows[keynr]= rows;
      param->table->quick_costs[keynr]= cost->total_cost();
      param->table->quick_key_io[keynr]=
        file->get_io_cost(keynr, param->table->quick_rows[keynr], &length);
    }
  }
  /* Figure out if the key scan is ROR (returns rows in ROWID order) or not */
--- a/sql/rowid_filter.cc
+++ b/sql/rowid_filter.cc
@ -0,0 +1,218 @@
 #include "rowid_filter.h"
 /**
  Sets information about filter with key_numb index.
  It sets a cardinality of filter, calculates its selectivity
  and gets slope and interscept values.
 */
 void Range_filter_cost_info::init(TABLE *tab, uint key_numb)
 {
  table= tab;
  key_no= key_numb;
  cardinality= table->quick_rows[key_no];
  b= filter_io_cost() + filter_write_cost() + filter_sort_cost();
  selectivity= cardinality/((double) table->stat_records());
  a= (1 + COST_COND_EVAL)*(1 - selectivity) - lookup_cost();
  intersect_x_axis_abcissa= b/a;
 }
 /**
  @brief
    Sort available filters by their building cost in the increasing order
  @details
    The method starts sorting available filters from the first filter that
    is not defined as the best filter. If there are two filters that are
    defined as the best filters there is no need to sort other filters.
    Best filters are already sorted by their building cost and have the
    smallest bulding cost in comparison with other filters by definition.
    As the sorting method bubble sort is used.
 */
 void TABLE::sort_range_filter_cost_info_array()
 {
  if (best_filter_count == 2)
    return;
  for (uint i= best_filter_count; i < range_filter_cost_info_elements-1; i++)
  {
    for (uint j= i+1; j < range_filter_cost_info_elements; j++)
    {
      if (range_filter_cost_info[i].intersect_x_axis_abcissa >
          range_filter_cost_info[j].intersect_x_axis_abcissa)
        swap_variables(Range_filter_cost_info,
                       range_filter_cost_info[i],
                       range_filter_cost_info[j]);
    }
  }
 }
 /**
  @brief
    The method searches for the filters that can reduce the join cost the most
  @details
    The method looks through the available filters trying to choose the best
    filter and eliminate as many filters as possible.
    Filters are considered as a linear functions. The best filter is the linear
    function that intersects all other linear functions not in the I quadrant
    and has the biggest a (slope) value. This filter will reduce the partial
    join cost the most. If it is possible the second best filter is also
    chosen. The second best filter can be used if the ref access is made on
    the index of the first best filter.
    So there is no need to store all other filters except filters that
    intersect in the I quadrant. It is impossible to say on this step which
    filter is better and will give the biggest gain.
    The number of filters that can be used is stored in the
    range_filter_cost_info_elements variable.
 */
 void TABLE::prune_range_filters()
 {
  key_map pruned_filter_map;
  pruned_filter_map.clear_all();
  Range_filter_cost_info *max_slope_filters[2] = {0, 0};
  for (uint i= 0; i < range_filter_cost_info_elements; i++)
  {
    Range_filter_cost_info *filter= &range_filter_cost_info[i];
    if (filter->a < 0)
    {
      range_filter_cost_info_elements--;
      swap_variables(Range_filter_cost_info, range_filter_cost_info[i],
                     range_filter_cost_info[range_filter_cost_info_elements]);
      continue;
    }
    for (uint j= i+1; j < range_filter_cost_info_elements; j++)
    {
      Range_filter_cost_info *cand_filter= &range_filter_cost_info[j];
      double intersect_x= filter->get_intersect_x(cand_filter);
      double intersect_y= filter->get_intersect_y(intersect_x);
      if (intersect_x > 0 && intersect_y > 0)
      {
        pruned_filter_map.set_bit(cand_filter->key_no);
        pruned_filter_map.set_bit(filter->key_no);
      }
    }
    if (!pruned_filter_map.is_set(filter->key_no))
    {
      if (!max_slope_filters[0])
        max_slope_filters[0]= filter;
      else
      {
        if (!max_slope_filters[1] ||
            max_slope_filters[1]->a < filter->a)
          max_slope_filters[1]= filter;
        if (max_slope_filters[0]->a < max_slope_filters[1]->a)
          swap_variables(Range_filter_cost_info*, max_slope_filters[0],
                                                  max_slope_filters[1]);
      }
    }
  }
  for (uint i= 0; i<2; i++)
  {
    if (max_slope_filters[i])
    {
      swap_variables(Range_filter_cost_info,
                     range_filter_cost_info[i],
                     *max_slope_filters[i]);
      if (i == 0 &&
          max_slope_filters[1] == &range_filter_cost_info[0])
        max_slope_filters[1]= max_slope_filters[0];
      best_filter_count++;
      max_slope_filters[i]= &range_filter_cost_info[i];
    }
  }
  sort_range_filter_cost_info_array();
 }
 void TABLE::select_usable_range_filters(THD *thd)
 {
  uint key_no;
  key_map usable_range_filter_keys;
  usable_range_filter_keys.clear_all();
  key_map::Iterator it(quick_keys);
  while ((key_no= it++) != key_map::Iterator::BITMAP_END)
  {
    if (quick_rows[key_no] >
        thd->variables.max_rowid_filter_size/file->ref_length)
      continue;
    usable_range_filter_keys.set_bit(key_no);
  }
  if (usable_range_filter_keys.is_clear_all())
    return;
  range_filter_cost_info_elements= usable_range_filter_keys.bits_set();
  range_filter_cost_info=
    new (thd->mem_root) Range_filter_cost_info [range_filter_cost_info_elements];
  Range_filter_cost_info *curr_filter_cost_info= range_filter_cost_info;
  key_map::Iterator li(usable_range_filter_keys);
  while ((key_no= li++) != key_map::Iterator::BITMAP_END)
  {
    curr_filter_cost_info->init(this, key_no);
    curr_filter_cost_info++;
  }
  prune_range_filters();
 }
 Range_filter_cost_info
 *TABLE::best_filter_for_current_join_order(uint ref_key_no,
                                           double record_count,
                                           double records)
 {
  if (!this || range_filter_cost_info_elements == 0)
    return 0;
  double card= record_count*records;
  Range_filter_cost_info *best_filter= &range_filter_cost_info[0];
  if (card < best_filter->intersect_x_axis_abcissa)
    return 0;
  if (best_filter_count != 0)
  {
    if (best_filter->key_no == ref_key_no)
    {
      if (best_filter_count == 2)
      {
        best_filter= &range_filter_cost_info[1];
        if (card < best_filter->intersect_x_axis_abcissa)
          return 0;
        return best_filter;
      }
    }
    else
      return best_filter;
  }
  double best_filter_improvement= 0.0;
  best_filter= 0;
  for (uint i= best_filter_count; i < range_filter_cost_info_elements; i++)
  {
    Range_filter_cost_info *filter= &range_filter_cost_info[i];
    if (card < filter->intersect_x_axis_abcissa)
      break;
    if (best_filter_improvement < filter->get_filter_gain(card))
    {
      best_filter_improvement= filter->get_filter_gain(card);
      best_filter= filter;
    }
  }
  return best_filter;
 }
--- a/sql/rowid_filter.h
+++ b/sql/rowid_filter.h
@ -0,0 +1,183 @@
 #ifndef ROWID_FILTER_INCLUDED
 #define ROWID_FILTER_INCLUDED
 /**
 It makes sense to apply filters for a certain join order when the following
 inequality holds:
 #T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) +
              I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi)) +
              c3*#T (1),
 where #T         - the fanout of the partial join
       Fi         - a filter for the index with the number i in 
                    the key_map of available indexes for this table
       sel(Fi)    - the selectivity of the index with the number 
                    i
       c4*#T,
       c4*#T*sel(Fi) - a cost to apply available predicates
       c4         - a constant to apply available predicates
       I/O(Fi)    - a cost of the I/O accesses to Fi
       #(Fi)      - a number of estimated records that range
                    access would use
       c1*#(Fi)   - a cost to write in Fi
       c1         - a constant to write one element in Fi
       c2*#(Fi)*log(#(Fi)) - a cost to sort in Fi
       c2         - a sorting constant
       c3*(#T)    - a cost to look-up into a current partial join
       c3         - a constant to look-up into Fi
  Let's set a new variable FBCi (filter building cost for the filter with
  index i):
  FBCi = I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi))
  It can be seen that FBCi doesn't depend on #T.
  So using this variable (1) can be rewritten:
  #T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) +
               FBCi +
               c3*#T
  To get a possible cost improvement when a filter is used right part
  of the (1) inequality should be deducted from the left part.
  Denote it as G(#T):
  G(#T)= #T + c4*#T - (#T*sel(Fi) + c4*#T*sel(Fi) + FBCi + c3*#T) (2)
  On the prepare stage when filters are created #T value isn't known.
  To find out what filter is the best among available one for the table
  (what filter gives the biggest gain) a knowledge about linear functions
  can be used. Consider filter gain as a linear function:
  Gi(#T)= ai*#T + bi (3)
  where ai= 1+c4-c3-sel(Fi)*(1+c4),
        bi= -FBCi
  Filter gain can be interpreted as an ordinate, #T as abscissa.
  So the aim is to find the linear function that has the biggest ordinate value
  for each positive abscissa (because #T can't be negative) comparing with
  the other available functions.
  Consider two filters Fi, Fj or linear functions with a positive slope.
  To find out which linear function is better let's find their intersection
  point coordinates.
  Gi(#T0)= Gj(#T0) (using (2))=>
  #T0= (bj - bi)/(ai - aj) (using (3))
  =>
  #T0= (BCFj-BCFi)/((sel(Fj)-sel(Fi))*(1+c4))
  If put #T0 value into the (3) formula G(#T0) can be easily found.
  It can be seen that if two linear functions intersect in II, III or IV
  quadrants the linear function with a bigger slope value will always
  be better.
  If two functions intersect in the I quadrant for #T1 < #T0 a function
  with a smaller slope value will give a better gain and when #T1 > #T0
  function with a bigger slope will give better gain.
  for each #T1 > #T0  if (ai > aj) => (Gi(#T1) >= Gj(#T1))
           #T1 <= #T0 if (ai > aj) => (Gi(#T1) <= Gj(#T1))
  So both linear functions should be saved.
  Interesting cases:
  1. For Fi,Fj filters ai=aj. 
    In this case intercepts bi and bj should be compared.
    The filter with the biggest intercept will give a better result.
  2. Only one filter remains after the calculations and for some join order
     it is equal to the index that is used to access table. Therefore, this
     filter can't be used.
     In this case the gain is computed for every filter that can be constructed
     for this table.
  After information about filters is computed for each partial join order
  it is checked if the filter can be applied to the current table.
  If it gives a cost improvement it is saved as the best plan for this
  partial join.
 */
 #include "mariadb.h"
 #include "sql_class.h"
 #include "table.h"
 /* Cost to write into filter */
 #define COST_WRITE      0.01
 /* Weight factor for filter sorting */
 #define CNST_SORT       0.01
 /* Cost to evaluate condition */
 #define COST_COND_EVAL  0.2
 class QUICK_RANGE_SELECT;
 class Range_filter_cost_info : public Sql_alloc
 {
 public:
  TABLE *table;
  uint key_no;
  double cardinality;
  double b;                         // intercept of the linear function
  double a;                         // slope of the linear function
  double selectivity;
  double intersect_x_axis_abcissa;
  /**
    Filter cost functions
  */
  /* Cost to lookup into filter */
  inline double lookup_cost()
  {
    return log(cardinality)*0.01;
  }
  /* IO accesses cost to access filter */
  inline double filter_io_cost()
  { return table->quick_key_io[key_no]; }
  /* Cost to write elements in filter */
  inline double filter_write_cost()
  { return COST_WRITE*cardinality; }
  /* Cost to sort elements in filter */
  inline double filter_sort_cost()
  { 
    return CNST_SORT*cardinality*log(cardinality);
  }
  /* End of filter cost functions */
  Range_filter_cost_info() : table(0), key_no(0) {}
  void init(TABLE *tab, uint key_numb);
  inline double get_intersect_x(Range_filter_cost_info *filter)
  {
    if (a == filter->a)
      return DBL_MAX;
    return (b - filter->b)/(a - filter->a);
  }
  inline double get_intersect_y(double intersect_x)
  {
    if (intersect_x == DBL_MAX)
      return DBL_MAX;
    return intersect_x*a - b;
  }
  /**
    Get a gain that a usage of filter in some partial join order
    with the cardinaly card gives
  */
  inline double get_filter_gain(double card)
  {  return card*a - b;  }
 };
 #endif /* ROWID_FILTER_INCLUDED */
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@ -2668,12 +2668,12 @@ void THD::make_explain_field_list(List<Item> &field_list, uint8 explain_flags,
                       mem_root);
  item->maybe_null=1;
  field_list.push_back(item=new (mem_root)
-                       Item_empty_string(this, "ref",
+                       Item_empty_string(this, "ref|filter",
                                         NAME_CHAR_LEN*MAX_REF_PARTS, cs),
                       mem_root);
  item->maybe_null=1;
-  field_list.push_back(item= new (mem_root)
+  field_list.push_back(item=new (mem_root)
-                       Item_return_int(this, "rows", 10, MYSQL_TYPE_LONGLONG),
+                       Item_empty_string(this, "rows", NAME_CHAR_LEN, cs),
                       mem_root);
  if (is_analyze)
  {
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@ -727,6 +727,7 @@ typedef struct system_variables
  uint column_compression_threshold;
  uint column_compression_zlib_level;
  uint in_subquery_conversion_threshold;
  ulonglong max_rowid_filter_size;
  vers_asof_timestamp_t vers_asof_timestamp;
  ulong vers_alter_history;
--- a/sql/sql_explain.cc
+++ b/sql/sql_explain.cc
@ -1146,6 +1146,15 @@ void Explain_table_access::fill_key_len_str(String *key_len_str) const
    length= longlong10_to_str(hash_next_key.get_key_len(), buf, 10) - buf;
    key_len_str->append(buf, length);
  }
  if (key.get_filter_key_length() != (uint)-1)
  {
    char buf[64];
    size_t length;
    key_len_str->append(',');
    length= longlong10_to_str(key.get_filter_key_length(), buf, 10) - buf;
    key_len_str->append(buf, length);
  }
 }
@ -1274,12 +1283,20 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
    push_string_list(thd, &item_list, ref_list, &ref_list_buf);
  /* `rows` */
  StringBuffer<64> rows_str;
  if (rows_set)
  {
    rows_str.append_ulonglong((ulonglong)rows);
    if (is_filter_set())
    {
      rows_str.append(" (");
      rows_str.append_ulonglong(filter_perc);
      rows_str.append("%)");
    }
    item_list.push_back(new (mem_root)
-                        Item_int(thd, (longlong) (ulonglong) rows,
+                        Item_string_sys(thd, rows_str.ptr(),
-                                 MY_INT64_NUM_DECIMAL_DIGITS),
+                                        rows_str.length()), mem_root);
                        mem_root);
  }
  else
    item_list.push_back(item_null, mem_root);
@ -1359,6 +1376,15 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
    extra_buf.append(STRING_WITH_LEN("Using filesort"));
  }
  if (is_filter_set())
  {
    if (first)
      first= false;
    else
      extra_buf.append(STRING_WITH_LEN("; "));
    extra_buf.append(STRING_WITH_LEN("Using filter"));
  }
  item_list.push_back(new (mem_root)
                      Item_string_sys(thd, extra_buf.ptr(),
                                      extra_buf.length()),
--- a/sql/sql_explain.h
+++ b/sql/sql_explain.h
@ -583,6 +583,7 @@ class Explain_index_use : public Sql_alloc
 {
  char *key_name;
  uint key_len;
  uint key_len_for_filter;
 public:
  String_list key_parts_list;
@ -595,12 +596,16 @@ public:
  {
    key_name= NULL;
    key_len= (uint)-1;
    key_len_for_filter= (uint)-1;
  }
  bool set(MEM_ROOT *root, KEY *key_name, uint key_len_arg);
  bool set_pseudo_key(MEM_ROOT *root, const char *key_name);
  void set_filter_key_length(uint key_length_arg)
  { key_len_for_filter= key_length_arg; }
  inline const char *get_key_name() const { return key_name; }
  inline uint get_key_len() const { return key_len; }
  inline uint get_filter_key_length() const { return key_len_for_filter; }
 };
@ -689,7 +694,8 @@ public:
    cache_cond(NULL),
    pushed_index_cond(NULL),
    sjm_nest(NULL),
-    pre_join_sort(NULL)
+    pre_join_sort(NULL),
    filter_perc(UINT_MAX)
  {}
  ~Explain_table_access() { delete sjm_nest; }
@ -796,6 +802,13 @@ public:
  Exec_time_tracker op_tracker;
  Table_access_tracker jbuf_tracker;
  /**
    How many rows are left after the filter was applied
    to the initial rows count in percentages.
  */
  double filter_perc;
  inline bool is_filter_set() const { return (filter_perc != UINT_MAX); }
  int print_explain(select_result_sink *output, uint8 explain_flags, 
                    bool is_analyze,
                    uint select_id, const char *select_type,
--- a/sql/sql_priv.h
+++ b/sql/sql_priv.h
@ -233,6 +233,8 @@
 #define OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_DERIVED (1ULL << 30)
 #define OPTIMIZER_SWITCH_SPLIT_MATERIALIZED        (1ULL << 31)
 #define OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_SUBQUERY (1ULL << 32)
 #define OPTIMIZER_SWITCH_USE_ROWID_FILTER          (1ULL << 33)
 #define OPTIMIZER_SWITCH_DEFAULT   (OPTIMIZER_SWITCH_INDEX_MERGE | \
                                    OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
@ -260,7 +262,8 @@
                                    OPTIMIZER_SWITCH_ORDERBY_EQ_PROP | \
                                    OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_DERIVED | \
                                    OPTIMIZER_SWITCH_SPLIT_MATERIALIZED | \
-                                    OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_SUBQUERY)
+                                    OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_SUBQUERY |\
                                    OPTIMIZER_SWITCH_USE_ROWID_FILTER)
 /*
  Replication uses 8 bytes to store SQL_MODE in the binary log. The day you
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@ -64,6 +64,7 @@
 #include "sys_vars_shared.h"
 #include "sp_head.h"
 #include "sp_rcontext.h"
 #include "rowid_filter.h"
 /*
  A key part number that means we're using a fulltext scan.
@ -4950,6 +4951,10 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
        s->needed_reg=select->needed_reg;
        select->quick=0;
        impossible_range= records == 0 && s->table->reginfo.impossible_range;
        if (join->thd->lex->sql_command == SQLCOM_SELECT &&
            join->table_count > 1 &&
            optimizer_flag(join->thd, OPTIMIZER_SWITCH_USE_ROWID_FILTER))
          s->table->select_usable_range_filters(join->thd);
      }
      if (!impossible_range)
      {
@ -6744,12 +6749,14 @@ best_access_path(JOIN      *join,
  double best_time=         DBL_MAX;
  double records=           DBL_MAX;
  table_map best_ref_depends_map= 0;
  Range_filter_cost_info *best_filter= 0;
  double tmp;
  ha_rows rec;
  bool best_uses_jbuf= FALSE;
  MY_BITMAP *eq_join_set= &s->table->eq_join_set;
  KEYUSE *hj_start_key= 0;
  SplM_plan_info *spl_plan= 0;
  Range_filter_cost_info *filter= 0;
  disable_jbuf= disable_jbuf || idx == join->const_tables;  
@ -6781,6 +6788,7 @@ best_access_path(JOIN      *join,
      key_part_map found_part= 0;
      table_map found_ref= 0;
      uint key= keyuse->key;
      filter= 0;
      bool ft_key=  (keyuse->keypart == FT_KEYPART);
      /* Bitmap of keyparts where the ref access is over 'keypart=const': */
      key_part_map const_part= 0;
@ -7141,6 +7149,12 @@ best_access_path(JOIN      *join,
        loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp);
      } /* not ft_key */
      filter= table->best_filter_for_current_join_order(start_key->key,
                                                        records,
                                                        record_count);
      if (filter && (filter->get_filter_gain(record_count*records) < tmp))
        tmp= tmp - filter->get_filter_gain(record_count*records);
      if (tmp + 0.0001 < best_time - records/(double) TIME_FOR_COMPARE)
      {
        best_time= tmp + records/(double) TIME_FOR_COMPARE;
@ -7149,6 +7163,7 @@ best_access_path(JOIN      *join,
        best_key= start_key;
        best_max_key_part= max_key_part;
        best_ref_depends_map= found_ref;
        best_filter= filter;
      }
    } /* for each key */
    records= best_records;
@ -7190,6 +7205,7 @@ best_access_path(JOIN      *join,
    best_key= hj_start_key;
    best_ref_depends_map= 0;
    best_uses_jbuf= TRUE;
    best_filter= 0;
   }
  /*
@ -7294,8 +7310,15 @@ best_access_path(JOIN      *join,
        tmp+= (s->records - rnd_records)/(double) TIME_FOR_COMPARE;
      }
    }
-
+    double best_records= rnd_records;
    tmp += s->startup_cost;
    filter= s->table->best_filter_for_current_join_order(MAX_KEY,
                                                         rnd_records,
                                                         record_count);
    if (filter && (filter->get_filter_gain(record_count*rnd_records) < tmp))
      tmp= tmp - filter->get_filter_gain(record_count*rnd_records);
    /*
      We estimate the cost of evaluating WHERE clause for found records
      as record_count * rnd_records / TIME_FOR_COMPARE. This cost plus
@ -7311,8 +7334,9 @@ best_access_path(JOIN      *join,
        will ensure that this will be used
      */
      best= tmp;
-      records= rnd_records;
+      records= best_records;
      best_key= 0;
      best_filter= filter;
      /* range/index_merge/ALL/index access method are "independent", so: */
      best_ref_depends_map= 0;
      best_uses_jbuf= MY_TEST(!disable_jbuf && !((s->table->map &
@ -7329,6 +7353,7 @@ best_access_path(JOIN      *join,
  pos->loosescan_picker.loosescan_key= MAX_KEY;
  pos->use_join_buffer= best_uses_jbuf;
  pos->spl_plan= spl_plan;
  pos->filter= best_filter;
  loose_scan_opt.save_to_position(s, loose_scan_pos);
@ -9429,6 +9454,7 @@ bool JOIN::inject_cond_into_where(Item *injected_cond)
 static Item * const null_ptr= NULL;
 /*
  Set up join struct according to the picked join order in
@ -9588,6 +9614,8 @@ bool JOIN::get_best_combination()
        is_hash_join_key_no(j->ref.key))
      hash_join= TRUE; 
    j->filter= best_positions[tablenr].filter;
  loop_end:
    /* 
      Save records_read in JOIN_TAB so that select_describe()/etc don't have
@ -12448,7 +12476,9 @@ ha_rows JOIN_TAB::get_examined_rows()
  double examined_rows;
  SQL_SELECT *sel= filesort? filesort->select : this->select;
-  if (sel && sel->quick && use_quick != 2)
+  if (filter)
    examined_rows= records_read;
  else if (sel && sel->quick && use_quick != 2)
    examined_rows= (double)sel->quick->records;
  else if (type == JT_NEXT || type == JT_ALL ||
           type == JT_HASH || type ==JT_HASH_NEXT)
@ -24883,6 +24913,31 @@ int append_possible_keys(MEM_ROOT *alloc, String_list &list, TABLE *table,
 }
 /**
  This method saves the data that should be printed in EXPLAIN
  if any filter was used for this table.
 */
 bool JOIN_TAB::save_filter_explain_data(Explain_table_access *eta)
 {
  if (!filter)
    return 0;
  KEY *pk_key= get_keyinfo_by_key_no(filter->key_no);
  StringBuffer<64> buff_for_pk;
  const char *tmp_buff;
  buff_for_pk.append("filter:");
  tmp_buff= pk_key->name.str;
  buff_for_pk.append(tmp_buff, strlen(tmp_buff), system_charset_info);
  if (!(eta->ref_list.append_str(join->thd->mem_root,
                                 buff_for_pk.c_ptr_safe())))
    return 1;
  eta->key.set_filter_key_length(pk_key->key_length);
  (filter->selectivity*100 >= 1) ? eta->filter_perc= round(filter->selectivity*100) :
                                   eta->filter_perc= 1;
  return 0;
 }
 bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
                                 table_map prefix_tables, 
                                 bool distinct_arg, JOIN_TAB *first_top_tab)
@ -25138,6 +25193,13 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
    eta->filtered= f;
  }
  if ((tab_select && tab_select->quick && tab_type != JT_CONST) ||
      (key_info && ref.key_parts && tab_type != JT_FT))
  {
    if (save_filter_explain_data(eta))
      return 1;
  }
  /* Build "Extra" field and save it */
  key_read= table->file->keyread_enabled();
  if ((tab_type == JT_NEXT || tab_type == JT_CONST) &&
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@ -506,6 +506,9 @@ typedef struct st_join_table {
  uint n_sj_tables;
  bool preread_init_done;
  /* Copy of POSITION::filter, set by get_best_combination() */
  Range_filter_cost_info *filter;
  Dynamic_array<char*> rowid_filter_pk;
  void cleanup();
  inline bool is_using_loose_index_scan()
@ -656,6 +659,7 @@ typedef struct st_join_table {
  SplM_plan_info *choose_best_splitting(double record_count,
                                        table_map remaining_tables);
  bool fix_splitting(SplM_plan_info *spl_plan, table_map remaining_tables);
  bool save_filter_explain_data(Explain_table_access *eta);
 } JOIN_TAB;
@ -881,6 +885,9 @@ public:
 };
 class Range_filter_cost_info;
 /**
  Information about a position of table within a join order. Used in join
  optimization.
@ -963,6 +970,8 @@ typedef struct st_position
  /* Info on splitting plan used at this position */  
  SplM_plan_info *spl_plan;
  /* The index for which filter can be built */
  Range_filter_cost_info *filter;
 } POSITION;
 typedef Bounds_checked_array<Item_null_result*> Item_null_array;
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@ -2493,6 +2493,7 @@ export const char *optimizer_switch_names[]=
  "condition_pushdown_for_derived",
  "split_materialized",
  "condition_pushdown_for_subquery",
  "rowid_filter",
  "default", 
  NullS
 };
@ -6113,3 +6114,10 @@ static Sys_var_enum Sys_secure_timestamp(
       "historical behavior, anyone can modify session timestamp",
       READ_ONLY GLOBAL_VAR(opt_secure_timestamp), CMD_LINE(REQUIRED_ARG),
       secure_timestamp_levels, DEFAULT(SECTIME_NO));
 static Sys_var_ulonglong Sys_max_rowid_filter_size(
       "max_rowid_filter_size",
       "The maximum number of rows that fit in memory",
       SESSION_VAR(max_rowid_filter_size), CMD_LINE(REQUIRED_ARG),
       VALID_RANGE(1024, (ulonglong)~(intptr)0), DEFAULT(128*1024),
       BLOCK_SIZE(1));
--- a/sql/table.cc
+++ b/sql/table.cc
@ -4633,6 +4633,8 @@ void TABLE::init(THD *thd, TABLE_LIST *tl)
  created= TRUE;
  cond_selectivity= 1.0;
  cond_selectivity_sampling_explain= NULL;
  best_filter_count= 0;
  range_filter_cost_info_elements= 0;
 #ifdef HAVE_REPLICATION
  /* used in RBR Triggers */
  master_had_triggers= 0;
--- a/sql/table.h
+++ b/sql/table.h
@ -55,6 +55,7 @@ class Virtual_column_info;
 class Table_triggers_list;
 class TMP_TABLE_PARAM;
 class SEQUENCE;
 class Range_filter_cost_info;
 /*
  Used to identify NESTED_JOIN structures within a join (applicable only to
@ -1201,6 +1202,8 @@ public:
  uint    quick_key_parts[MAX_KEY];
  uint    quick_n_ranges[MAX_KEY];
  /* For each key I/O access cost is stored */
  double  quick_key_io[MAX_KEY];
  /* 
    Estimate of number of records that satisfy SARGable part of the table
@ -1491,6 +1494,21 @@ public:
  void deny_splitting();
  void add_splitting_info_for_key_field(struct KEY_FIELD *key_field);
  /**
    Range filter info
  */
  /* Minimum possible #T value to apply filter*/
  uint best_filter_count;
  uint range_filter_cost_info_elements;
  Range_filter_cost_info *range_filter_cost_info;
  Range_filter_cost_info
    *best_filter_for_current_join_order(uint ref_key_no,
                                        double record_count,
                                        double records);
  void sort_range_filter_cost_info_array();
  void prune_range_filters();
  void select_usable_range_filters(THD *thd);
  /**
    System Versioning support
   */