MDEV-16188: Use in-memory PK filters built from range index scans

First phase: make optimizer choose to use filter and show it in EXPLAIN.
Galina Shalygina 2018-08-16 00:24:52 +03:00
parent befc09f002
commit 8d5a11122c
17 changed files with 722 additions and 18 deletions


@ -0,0 +1,142 @@
--disable_warnings
DROP DATABASE IF EXISTS dbt3_s001;
--enable_warnings
CREATE DATABASE dbt3_s001;
use dbt3_s001;
--disable_query_log
--disable_result_log
--disable_warnings
--source include/dbt3_s001.inc
--enable_warnings
--enable_result_log
--enable_query_log
--echo # lineitem : {i_l_receiptdate, i_l_shipdate} -> i_l_receiptdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
l_receiptdate BETWEEN '1997-01-10' AND '1997-01-25';
--echo # lineitem : {i_l_receiptdate, i_l_shipdate} -> i_l_receiptdate
--echo # orders : {i_o_orderdate} -> i_o_orderdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
l_receiptdate BETWEEN '1997-01-10' AND '1997-01-25' AND
o_orderdate > '1997-01-15';
--echo # lineitem : {i_l_receiptdate, i_l_shipdate,
--echo # i_l_commitdate} -> i_l_receiptdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
l_receiptdate BETWEEN '1997-01-10' AND '1997-01-25' AND
l_commitdate BETWEEN '1997-01-05' AND '1997-01-25';
--echo # lineitem : {i_l_receiptdate, i_l_shipdate,
--echo # i_l_commitdate} -> i_l_commitdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1997-01-01' AND '1997-02-01' AND
l_receiptdate BETWEEN '1997-01-01' AND '1997-01-25' AND
l_commitdate BETWEEN '1997-01-15' AND '1997-01-25';
CREATE INDEX i_l_extendedprice ON lineitem(l_extendedprice);
--echo # lineitem : {i_l_receiptdate, i_l_shipdate, i_l_commitdate,
--echo # i_l_extendedprice} -> i_l_extendedprice
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1996-11-01' AND '1997-01-21' AND
l_receiptdate BETWEEN '1996-11-21' AND '1997-01-25' AND
l_commitdate BETWEEN '1996-11-25' AND '1997-01-20' AND
l_extendedprice BETWEEN 26000 AND 27000;
--echo # lineitem : {i_l_shipdate, i_l_extendedprice} -> i_l_shipdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1997-01-11' AND '1997-01-21' AND
l_extendedprice BETWEEN 26000 AND 27000;
--echo # lineitem : {i_l_shipdate, i_l_extendedprice} -> i_l_extendedprice
--echo # intersection point in the I quadrant
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE (l_shipdate BETWEEN '1997-01-11' AND '1997-01-26' OR
l_shipdate BETWEEN '1995-02-01' AND '1995-02-14' OR
l_shipdate BETWEEN '1994-12-12' AND '1994-12-28'
) AND l_extendedprice BETWEEN 26000 AND 27000;
--echo # lineitem : {i_l_shipdate, i_l_extendedprice} -> i_l_shipdate
--echo # parallel lines
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE (l_shipdate BETWEEN '1997-01-11' AND '1997-01-26' OR
l_shipdate BETWEEN '1995-02-01' AND '1995-02-16' OR
l_shipdate BETWEEN '1994-12-12' AND '1994-12-27'
) AND l_extendedprice BETWEEN 26000 AND 27000;
CREATE INDEX i_l_discount ON lineitem(l_discount);
CREATE INDEX i_l_tax ON lineitem(l_tax);
--echo # lineitem : {i_l_receiptdate, i_l_shipdate, i_l_commitdate,
--echo # i_l_extendedprice, i_l_discount, i_l_tax}
--echo # -> {i_l_extendedprice}
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1996-11-01' AND '1997-01-21' AND
l_receiptdate BETWEEN '1996-11-21' AND '1997-01-25' AND
l_commitdate BETWEEN '1996-11-25' AND '1997-01-20' AND
l_extendedprice BETWEEN 26000 AND 27000 AND
l_discount BETWEEN 0 AND 0.01 AND
l_tax BETWEEN 0.03 AND 0.04;
DROP INDEX i_l_extendedprice on lineitem;
DROP INDEX i_l_discount on lineitem;
DROP INDEX i_l_tax on lineitem;
SET max_rowid_filter_size= 1024;
--echo # lineitem : {i_l_shipdate, i_l_receiptdate, i_l_commitdate}
--echo # -> i_l_shipdate
--echo # i_l_commitdate isn't in-memory -> isn't used
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1996-12-28' AND '1997-01-20' AND
l_receiptdate BETWEEN '1996-12-21' AND '1997-01-25' AND
l_commitdate BETWEEN '1996-12-01' AND '1997-01-25';
SET max_rowid_filter_size= DEFAULT;
--echo # lineitem : {i_l_shipdate, i_l_commitdate} -> i_l_commitdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1993-01-01' AND '1997-01-30' AND
l_commitdate BETWEEN '1997-01-10' AND '1997-01-12';
--echo # lineitem : {i_l_shipdate, i_l_commitdate} -> i_l_commitdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1993-01-01' AND '1997-01-30' AND
l_commitdate BETWEEN '1993-01-10' AND '1997-01-12';
--echo # lineitem : {i_l_shipdate, i_l_commitdate, i_l_receiptdate}
--echo # -> i_l_receiptdate
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1993-01-01' AND '1997-01-30' AND
l_commitdate BETWEEN '1993-01-10' AND '1997-01-12' AND
l_receiptdate BETWEEN '1997-01-10' AND '1997-01-12';
--echo # lineitem : {i_l_shipdate, i_l_receiptdate} -> i_l_receiptdate
--echo # indexes with high selectivity
EXPLAIN SELECT *
FROM orders JOIN lineitem ON o_orderkey=l_orderkey
WHERE l_shipdate BETWEEN '1997-01-09' AND '1997-01-10' AND
l_receiptdate BETWEEN '1997-01-09' AND '1997-01-10';
DROP DATABASE dbt3_s001;


@ -134,6 +134,7 @@ SET (SQL_SOURCE
sql_sequence.cc sql_sequence.h ha_sequence.h
sql_tvc.cc sql_tvc.h
opt_split.cc
rowid_filter.cc rowid_filter.h
${WSREP_SOURCES}
table_cache.cc encryption.cc temporary_tables.cc
proxy_protocol.cc


@ -2601,6 +2601,22 @@ LEX_CSTRING *handler::engine_name()
}
/**
The method returns the cost, measured in blocks to read, of the random
I/O accesses made when the given index is used.
*/
double handler::get_io_cost(uint index, ha_rows rows, uint *length)
{
uint len= table->key_info[index].key_length + ref_length;
if (index == table->s->primary_key && table->file->primary_key_is_clustered())
len= table->s->stored_rec_length;
double keys_per_block= (stats.block_size/2.0/len+1);
*length= len;
return (rows + keys_per_block-1)/ keys_per_block;
}
double handler::keyread_time(uint index, uint ranges, ha_rows rows)
{
/*
@ -2612,11 +2628,8 @@ double handler::keyread_time(uint index, uint ranges, ha_rows rows)
engines that support that (e.g. InnoDB) may want to overwrite this method.
The model counts in the time to read index entries from cache.
*/
size_t len= table->key_info[index].key_length + ref_length;
if (index == table->s->primary_key && table->file->primary_key_is_clustered())
len= table->s->stored_rec_length;
double keys_per_block= (stats.block_size/2.0/len+1);
return (rows + keys_per_block-1)/ keys_per_block +
uint len;
return get_io_cost(index, rows, &len) +
len*rows/(stats.block_size+1)/TIME_FOR_COMPARE ;
}
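
For illustration only (not part of the commit): a self-contained sketch of the arithmetic above, with an assumed page size, key length and rowid length. It mirrors how get_io_cost() turns a row estimate into a number of random block reads; keyread_time() then adds the in-cache comparison term len*rows/(stats.block_size+1)/TIME_FOR_COMPARE on top of it.

#include <cstdio>

/*
  Standalone sketch (hypothetical numbers, not server code). The model
  assumes roughly half a block of useful payload per index page.
*/
static double index_io_blocks(double block_size, unsigned key_len,
                              unsigned ref_len, double rows)
{
  double entry_len= key_len + ref_len;                  // index entry + rowid
  double keys_per_block= block_size / 2.0 / entry_len + 1;
  return (rows + keys_per_block - 1) / keys_per_block;  // blocks to read
}

int main()
{
  // Assumed values: 16KB pages, an 8-byte key, a 6-byte rowid, 1000 rows.
  printf("estimated random block reads: %.2f\n",
         index_io_blocks(16384, 8, 6, 1000));
  return 0;
}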


@ -3221,6 +3221,8 @@ public:
*/
virtual double keyread_time(uint index, uint ranges, ha_rows rows);
double get_io_cost(uint index, ha_rows rows, uint *length);
virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; }
/*


@ -10515,6 +10515,7 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
handler *file= param->table->file;
ha_rows rows= HA_POS_ERROR;
uint keynr= param->real_keynr[idx];
uint length;
DBUG_ENTER("check_quick_select");
/* Handle cases when we don't have a valid non-empty list of range */
@ -10573,6 +10574,8 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
MY_MIN(param->table->quick_condition_rows, rows);
param->table->quick_rows[keynr]= rows;
param->table->quick_costs[keynr]= cost->total_cost();
param->table->quick_key_io[keynr]=
file->get_io_cost(keynr, param->table->quick_rows[keynr], &length);
}
}
/* Figure out if the key scan is ROR (returns rows in ROWID order) or not */

sql/rowid_filter.cc (new file, 218 lines)

@ -0,0 +1,218 @@
#include "rowid_filter.h"
/**
Sets the information about the filter built from the index with number
key_numb: the filter's cardinality, its selectivity, and the slope and
intercept values of its gain function.
*/
void Range_filter_cost_info::init(TABLE *tab, uint key_numb)
{
table= tab;
key_no= key_numb;
cardinality= table->quick_rows[key_no];
b= filter_io_cost() + filter_write_cost() + filter_sort_cost();
selectivity= cardinality/((double) table->stat_records());
a= (1 + COST_COND_EVAL)*(1 - selectivity) - lookup_cost();
intersect_x_axis_abcissa= b/a;
}
/**
@brief
Sort the available filters by their building cost in increasing order
@details
The method starts sorting the available filters from the first filter
that is not marked as a best filter. If two filters are already marked
as the best filters, there is no need to sort the others: by definition
the best filters are already ordered and have the smallest building cost
of all the filters.
Bubble sort is used as the sorting method.
*/
void TABLE::sort_range_filter_cost_info_array()
{
if (best_filter_count == 2)
return;
for (uint i= best_filter_count; i < range_filter_cost_info_elements-1; i++)
{
for (uint j= i+1; j < range_filter_cost_info_elements; j++)
{
if (range_filter_cost_info[i].intersect_x_axis_abcissa >
range_filter_cost_info[j].intersect_x_axis_abcissa)
swap_variables(Range_filter_cost_info,
range_filter_cost_info[i],
range_filter_cost_info[j]);
}
}
}
/**
@brief
The method searches for the filters that can reduce the join cost the most
@details
The method looks through the available filters, trying to choose the best
filter and to eliminate as many filters as possible.
Filters are considered as linear functions. The best filter is the linear
function that intersects all other linear functions outside the I quadrant
and has the biggest slope (a) value. This filter will reduce the partial
join cost the most. If possible, a second best filter is also chosen; it
can be used when the ref access is made on the same index as the first
best filter.
All other filters can be discarded, except those whose functions intersect
in the I quadrant: at this step it is impossible to say which of them is
better and will give the bigger gain.
The number of filters that can still be used is stored in the
range_filter_cost_info_elements variable.
*/
void TABLE::prune_range_filters()
{
key_map pruned_filter_map;
pruned_filter_map.clear_all();
Range_filter_cost_info *max_slope_filters[2] = {0, 0};
for (uint i= 0; i < range_filter_cost_info_elements; i++)
{
Range_filter_cost_info *filter= &range_filter_cost_info[i];
if (filter->a < 0)
{
range_filter_cost_info_elements--;
swap_variables(Range_filter_cost_info, range_filter_cost_info[i],
range_filter_cost_info[range_filter_cost_info_elements]);
continue;
}
for (uint j= i+1; j < range_filter_cost_info_elements; j++)
{
Range_filter_cost_info *cand_filter= &range_filter_cost_info[j];
double intersect_x= filter->get_intersect_x(cand_filter);
double intersect_y= filter->get_intersect_y(intersect_x);
if (intersect_x > 0 && intersect_y > 0)
{
pruned_filter_map.set_bit(cand_filter->key_no);
pruned_filter_map.set_bit(filter->key_no);
}
}
if (!pruned_filter_map.is_set(filter->key_no))
{
if (!max_slope_filters[0])
max_slope_filters[0]= filter;
else
{
if (!max_slope_filters[1] ||
max_slope_filters[1]->a < filter->a)
max_slope_filters[1]= filter;
if (max_slope_filters[0]->a < max_slope_filters[1]->a)
swap_variables(Range_filter_cost_info*, max_slope_filters[0],
max_slope_filters[1]);
}
}
}
for (uint i= 0; i<2; i++)
{
if (max_slope_filters[i])
{
swap_variables(Range_filter_cost_info,
range_filter_cost_info[i],
*max_slope_filters[i]);
if (i == 0 &&
max_slope_filters[1] == &range_filter_cost_info[0])
max_slope_filters[1]= max_slope_filters[0];
best_filter_count++;
max_slope_filters[i]= &range_filter_cost_info[i];
}
}
sort_range_filter_cost_info_array();
}
void TABLE::select_usable_range_filters(THD *thd)
{
uint key_no;
key_map usable_range_filter_keys;
usable_range_filter_keys.clear_all();
key_map::Iterator it(quick_keys);
while ((key_no= it++) != key_map::Iterator::BITMAP_END)
{
if (quick_rows[key_no] >
thd->variables.max_rowid_filter_size/file->ref_length)
continue;
usable_range_filter_keys.set_bit(key_no);
}
if (usable_range_filter_keys.is_clear_all())
return;
range_filter_cost_info_elements= usable_range_filter_keys.bits_set();
range_filter_cost_info=
new (thd->mem_root) Range_filter_cost_info [range_filter_cost_info_elements];
Range_filter_cost_info *curr_filter_cost_info= range_filter_cost_info;
key_map::Iterator li(usable_range_filter_keys);
while ((key_no= li++) != key_map::Iterator::BITMAP_END)
{
curr_filter_cost_info->init(this, key_no);
curr_filter_cost_info++;
}
prune_range_filters();
}
Range_filter_cost_info
*TABLE::best_filter_for_current_join_order(uint ref_key_no,
double record_count,
double records)
{
if (!this || range_filter_cost_info_elements == 0)
return 0;
double card= record_count*records;
Range_filter_cost_info *best_filter= &range_filter_cost_info[0];
if (card < best_filter->intersect_x_axis_abcissa)
return 0;
if (best_filter_count != 0)
{
if (best_filter->key_no == ref_key_no)
{
if (best_filter_count == 2)
{
best_filter= &range_filter_cost_info[1];
if (card < best_filter->intersect_x_axis_abcissa)
return 0;
return best_filter;
}
}
else
return best_filter;
}
double best_filter_improvement= 0.0;
best_filter= 0;
for (uint i= best_filter_count; i < range_filter_cost_info_elements; i++)
{
Range_filter_cost_info *filter= &range_filter_cost_info[i];
if (card < filter->intersect_x_axis_abcissa)
break;
if (best_filter_improvement < filter->get_filter_gain(card))
{
best_filter_improvement= filter->get_filter_gain(card);
best_filter= filter;
}
}
return best_filter;
}
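
To make the pruning rule in prune_range_filters() concrete (an illustration with invented numbers, not part of the patch): two filters are both kept only when their gain lines intersect strictly inside the I quadrant; otherwise the one with the bigger slope dominates for every positive #T and the other can be dropped. The sketch below repeats that test using the same conventions as get_intersect_x()/get_intersect_y() from rowid_filter.h, where the gain is a*#T - b and b is the positive cost of building the filter.

#include <cfloat>
#include <cstdio>

// Hypothetical gain lines gain(#T)= a*#T - b; the numbers are invented.
struct Line { double a, b; };

// Same computation as Range_filter_cost_info::get_intersect_x()
static double intersect_x(const Line &f, const Line &g)
{
  return (f.a == g.a) ? DBL_MAX : (f.b - g.b) / (f.a - g.a);
}

int main()
{
  Line f= {1.13, 300};   // steeper slope, more expensive to build
  Line g= {0.83, 120};   // flatter slope, cheaper to build
  double x= intersect_x(f, g);   // abscissa of the intersection point
  double y= x * f.a - f.b;       // same as get_intersect_y(x)
  if (x > 0 && y > 0)
    printf("intersection (%.0f, %.0f) is in quadrant I: keep both\n", x, y);
  else
    printf("keep only the filter with the bigger slope\n");
  return 0;
}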

sql/rowid_filter.h (new file, 183 lines)

@ -0,0 +1,183 @@
#ifndef ROWID_FILTER_INCLUDED
#define ROWID_FILTER_INCLUDED
/**
It makes sense to apply filters for a certain join order when the following
inequality holds:
#T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) +
I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi)) +
c3*#T (1),
where #T - the fanout of the partial join
Fi - a filter for the index with the number i in
the key_map of available indexes for this table
sel(Fi) - the selectivity of the index with the number i
c4*#T,
c4*#T*sel(Fi) - a cost to apply available predicates
c4 - a constant to apply available predicates
I/O(Fi) - a cost of the I/O accesses to Fi
#(Fi) - a number of estimated records that range
access would use
c1*#(Fi) - a cost to write in Fi
c1 - a constant to write one element in Fi
c2*#(Fi)*log(#(Fi)) - a cost to sort in Fi
c2 - a sorting constant
c3*#T - a cost of the look-ups into Fi made for the #T records
c3 - a constant of one look-up into Fi
Let's set a new variable FBCi (filter building cost for the filter with
index i):
FBCi = I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi))
It can be seen that FBCi doesn't depend on #T.
So using this variable (1) can be rewritten:
#T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) +
FBCi +
c3*#T
To get the possible cost improvement when a filter is used, the right
part of inequality (1) should be subtracted from the left part.
Denote this difference as G(#T):
G(#T)= #T + c4*#T - (#T*sel(Fi) + c4*#T*sel(Fi) + FBCi + c3*#T) (2)
At the prepare stage, when filters are created, the #T value isn't known.
To find out which filter is the best among those available for the table
(which filter gives the biggest gain), knowledge about linear functions
can be used. Consider the filter gain as a linear function:
Gi(#T)= ai*#T + bi (3)
where ai= 1+c4-c3-sel(Fi)*(1+c4),
bi= -FBCi
Filter gain can be interpreted as an ordinate, #T as abscissa.
So the aim is to find the linear function that has the biggest ordinate
value for every positive abscissa (#T can't be negative), compared with
the other available functions.
Consider two filters Fi and Fj, i.e. two linear functions with positive
slopes. To find out which of them is better, let's find the coordinates
of their intersection point:
Gi(#T0)= Gj(#T0) (using (3)) =>
#T0= (bj - bi)/(ai - aj)
=>
#T0= (FBCi - FBCj)/((sel(Fj) - sel(Fi))*(1+c4))
Substituting #T0 into formula (3), G(#T0) is easily found.
It can be seen that if two linear functions intersect in II, III or IV
quadrants the linear function with a bigger slope value will always
be better.
If two functions intersect in the I quadrant, then for #T1 < #T0 the
function with the smaller slope gives the better gain, and for #T1 > #T0
the function with the bigger slope gives the better gain:
for each #T1 > #T0, if (ai > aj) then (Gi(#T1) >= Gj(#T1))
for each #T1 <= #T0, if (ai > aj) then (Gi(#T1) <= Gj(#T1))
So both linear functions should be saved.
Interesting cases:
1. For Fi,Fj filters ai=aj.
In this case intercepts bi and bj should be compared.
The filter with the biggest intercept will give a better result.
2. Only one filter remains after the calculations, and for some join order
it is built on the same index that is used to access the table, so this
filter can't be used.
In this case the gain is computed for every filter that can be constructed
for this table.
After information about filters is computed for each partial join order
it is checked if the filter can be applied to the current table.
If it gives a cost improvement it is saved as the best plan for this
partial join.
*/
#include "mariadb.h"
#include "sql_class.h"
#include "table.h"
/* Cost to write into filter */
#define COST_WRITE 0.01
/* Weight factor for filter sorting */
#define CNST_SORT 0.01
/* Cost to evaluate condition */
#define COST_COND_EVAL 0.2
class QUICK_RANGE_SELECT;
class Range_filter_cost_info : public Sql_alloc
{
public:
TABLE *table;
uint key_no;
double cardinality;
double b; // intercept of the linear function
double a; // slope of the linear function
double selectivity;
double intersect_x_axis_abcissa;
/**
Filter cost functions
*/
/* Cost to lookup into filter */
inline double lookup_cost()
{
return log(cardinality)*0.01;
}
/* IO accesses cost to access filter */
inline double filter_io_cost()
{ return table->quick_key_io[key_no]; }
/* Cost to write elements in filter */
inline double filter_write_cost()
{ return COST_WRITE*cardinality; }
/* Cost to sort elements in filter */
inline double filter_sort_cost()
{
return CNST_SORT*cardinality*log(cardinality);
}
/* End of filter cost functions */
Range_filter_cost_info() : table(0), key_no(0) {}
void init(TABLE *tab, uint key_numb);
inline double get_intersect_x(Range_filter_cost_info *filter)
{
if (a == filter->a)
return DBL_MAX;
return (b - filter->b)/(a - filter->a);
}
inline double get_intersect_y(double intersect_x)
{
if (intersect_x == DBL_MAX)
return DBL_MAX;
return intersect_x*a - b;
}
/**
Get the gain that usage of this filter gives in a partial join order
with the cardinality card
*/
inline double get_filter_gain(double card)
{ return card*a - b; }
};
#endif /* ROWID_FILTER_INCLUDED */
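
As a numeric illustration of the cost functions above (not part of the commit; the cardinality, table size and I/O estimate are invented), the sketch below repeats the arithmetic of Range_filter_cost_info::init() with the constants from this header and prints the x-axis intercept b/a, i.e. the smallest partial-join fanout #T at which building the filter starts to pay off.

#include <cmath>
#include <cstdio>

// Constants copied from the header above.
static const double COST_WRITE= 0.01;
static const double CNST_SORT= 0.01;
static const double COST_COND_EVAL= 0.2;

int main()
{
  // Invented inputs: the range scan is expected to return 1000 of the
  // 100000 table rows, and reading its index entries costs about 3 blocks.
  double cardinality= 1000, table_records= 100000, io_cost= 3;

  double selectivity= cardinality / table_records;
  double b= io_cost + COST_WRITE * cardinality +
            CNST_SORT * cardinality * std::log(cardinality);   // build cost
  double lookup= std::log(cardinality) * 0.01;                 // one probe
  double a= (1 + COST_COND_EVAL) * (1 - selectivity) - lookup;

  printf("selectivity=%.3f  b=%.1f  a=%.3f  break-even #T=%.1f\n",
         selectivity, b, a, b / a);
  return 0;
}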


@ -2668,12 +2668,12 @@ void THD::make_explain_field_list(List<Item> &field_list, uint8 explain_flags,
mem_root);
item->maybe_null=1;
field_list.push_back(item=new (mem_root)
Item_empty_string(this, "ref",
Item_empty_string(this, "ref|filter",
NAME_CHAR_LEN*MAX_REF_PARTS, cs),
mem_root);
item->maybe_null=1;
field_list.push_back(item= new (mem_root)
Item_return_int(this, "rows", 10, MYSQL_TYPE_LONGLONG),
field_list.push_back(item=new (mem_root)
Item_empty_string(this, "rows", NAME_CHAR_LEN, cs),
mem_root);
if (is_analyze)
{


@ -727,6 +727,7 @@ typedef struct system_variables
uint column_compression_threshold;
uint column_compression_zlib_level;
uint in_subquery_conversion_threshold;
ulonglong max_rowid_filter_size;
vers_asof_timestamp_t vers_asof_timestamp;
ulong vers_alter_history;


@ -1146,6 +1146,15 @@ void Explain_table_access::fill_key_len_str(String *key_len_str) const
length= longlong10_to_str(hash_next_key.get_key_len(), buf, 10) - buf;
key_len_str->append(buf, length);
}
if (key.get_filter_key_length() != (uint)-1)
{
char buf[64];
size_t length;
key_len_str->append(',');
length= longlong10_to_str(key.get_filter_key_length(), buf, 10) - buf;
key_len_str->append(buf, length);
}
}
@ -1274,12 +1283,20 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
push_string_list(thd, &item_list, ref_list, &ref_list_buf);
/* `rows` */
StringBuffer<64> rows_str;
if (rows_set)
{
rows_str.append_ulonglong((ulonglong)rows);
if (is_filter_set())
{
rows_str.append(" (");
rows_str.append_ulonglong(filter_perc);
rows_str.append("%)");
}
item_list.push_back(new (mem_root)
Item_int(thd, (longlong) (ulonglong) rows,
MY_INT64_NUM_DECIMAL_DIGITS),
mem_root);
Item_string_sys(thd, rows_str.ptr(),
rows_str.length()), mem_root);
}
else
item_list.push_back(item_null, mem_root);
@ -1359,6 +1376,15 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
extra_buf.append(STRING_WITH_LEN("Using filesort"));
}
if (is_filter_set())
{
if (first)
first= false;
else
extra_buf.append(STRING_WITH_LEN("; "));
extra_buf.append(STRING_WITH_LEN("Using filter"));
}
item_list.push_back(new (mem_root)
Item_string_sys(thd, extra_buf.ptr(),
extra_buf.length()),


@ -583,6 +583,7 @@ class Explain_index_use : public Sql_alloc
{
char *key_name;
uint key_len;
uint key_len_for_filter;
public:
String_list key_parts_list;
@ -595,12 +596,16 @@ public:
{
key_name= NULL;
key_len= (uint)-1;
key_len_for_filter= (uint)-1;
}
bool set(MEM_ROOT *root, KEY *key_name, uint key_len_arg);
bool set_pseudo_key(MEM_ROOT *root, const char *key_name);
void set_filter_key_length(uint key_length_arg)
{ key_len_for_filter= key_length_arg; }
inline const char *get_key_name() const { return key_name; }
inline uint get_key_len() const { return key_len; }
inline uint get_filter_key_length() const { return key_len_for_filter; }
};
@ -689,7 +694,8 @@ public:
cache_cond(NULL),
pushed_index_cond(NULL),
sjm_nest(NULL),
pre_join_sort(NULL)
pre_join_sort(NULL),
filter_perc(UINT_MAX)
{}
~Explain_table_access() { delete sjm_nest; }
@ -796,6 +802,13 @@ public:
Exec_time_tracker op_tracker;
Table_access_tracker jbuf_tracker;
/**
Percentage of rows that remain after the filter is applied
to the initial row count.
*/
double filter_perc;
inline bool is_filter_set() const { return (filter_perc != UINT_MAX); }
int print_explain(select_result_sink *output, uint8 explain_flags,
bool is_analyze,
uint select_id, const char *select_type,


@ -233,6 +233,8 @@
#define OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_DERIVED (1ULL << 30)
#define OPTIMIZER_SWITCH_SPLIT_MATERIALIZED (1ULL << 31)
#define OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_SUBQUERY (1ULL << 32)
#define OPTIMIZER_SWITCH_USE_ROWID_FILTER (1ULL << 33)
#define OPTIMIZER_SWITCH_DEFAULT (OPTIMIZER_SWITCH_INDEX_MERGE | \
OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
@ -260,7 +262,8 @@
OPTIMIZER_SWITCH_ORDERBY_EQ_PROP | \
OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_DERIVED | \
OPTIMIZER_SWITCH_SPLIT_MATERIALIZED | \
OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_SUBQUERY)
OPTIMIZER_SWITCH_COND_PUSHDOWN_FOR_SUBQUERY |\
OPTIMIZER_SWITCH_USE_ROWID_FILTER)
/*
Replication uses 8 bytes to store SQL_MODE in the binary log. The day you


@ -64,6 +64,7 @@
#include "sys_vars_shared.h"
#include "sp_head.h"
#include "sp_rcontext.h"
#include "rowid_filter.h"
/*
A key part number that means we're using a fulltext scan.
@ -4950,6 +4951,10 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
s->needed_reg=select->needed_reg;
select->quick=0;
impossible_range= records == 0 && s->table->reginfo.impossible_range;
if (join->thd->lex->sql_command == SQLCOM_SELECT &&
join->table_count > 1 &&
optimizer_flag(join->thd, OPTIMIZER_SWITCH_USE_ROWID_FILTER))
s->table->select_usable_range_filters(join->thd);
}
if (!impossible_range)
{
@ -6744,12 +6749,14 @@ best_access_path(JOIN *join,
double best_time= DBL_MAX;
double records= DBL_MAX;
table_map best_ref_depends_map= 0;
Range_filter_cost_info *best_filter= 0;
double tmp;
ha_rows rec;
bool best_uses_jbuf= FALSE;
MY_BITMAP *eq_join_set= &s->table->eq_join_set;
KEYUSE *hj_start_key= 0;
SplM_plan_info *spl_plan= 0;
Range_filter_cost_info *filter= 0;
disable_jbuf= disable_jbuf || idx == join->const_tables;
@ -6781,6 +6788,7 @@ best_access_path(JOIN *join,
key_part_map found_part= 0;
table_map found_ref= 0;
uint key= keyuse->key;
filter= 0;
bool ft_key= (keyuse->keypart == FT_KEYPART);
/* Bitmap of keyparts where the ref access is over 'keypart=const': */
key_part_map const_part= 0;
@ -7141,6 +7149,12 @@ best_access_path(JOIN *join,
loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp);
} /* not ft_key */
filter= table->best_filter_for_current_join_order(start_key->key,
records,
record_count);
if (filter && (filter->get_filter_gain(record_count*records) < tmp))
tmp= tmp - filter->get_filter_gain(record_count*records);
if (tmp + 0.0001 < best_time - records/(double) TIME_FOR_COMPARE)
{
best_time= tmp + records/(double) TIME_FOR_COMPARE;
@ -7149,6 +7163,7 @@ best_access_path(JOIN *join,
best_key= start_key;
best_max_key_part= max_key_part;
best_ref_depends_map= found_ref;
best_filter= filter;
}
} /* for each key */
records= best_records;
@ -7190,6 +7205,7 @@ best_access_path(JOIN *join,
best_key= hj_start_key;
best_ref_depends_map= 0;
best_uses_jbuf= TRUE;
best_filter= 0;
}
/*
@ -7294,8 +7310,15 @@ best_access_path(JOIN *join,
tmp+= (s->records - rnd_records)/(double) TIME_FOR_COMPARE;
}
}
double best_records= rnd_records;
tmp += s->startup_cost;
filter= s->table->best_filter_for_current_join_order(MAX_KEY,
rnd_records,
record_count);
if (filter && (filter->get_filter_gain(record_count*rnd_records) < tmp))
tmp= tmp - filter->get_filter_gain(record_count*rnd_records);
/*
We estimate the cost of evaluating WHERE clause for found records
as record_count * rnd_records / TIME_FOR_COMPARE. This cost plus
@ -7311,8 +7334,9 @@ best_access_path(JOIN *join,
will ensure that this will be used
*/
best= tmp;
records= rnd_records;
records= best_records;
best_key= 0;
best_filter= filter;
/* range/index_merge/ALL/index access method are "independent", so: */
best_ref_depends_map= 0;
best_uses_jbuf= MY_TEST(!disable_jbuf && !((s->table->map &
@ -7329,6 +7353,7 @@ best_access_path(JOIN *join,
pos->loosescan_picker.loosescan_key= MAX_KEY;
pos->use_join_buffer= best_uses_jbuf;
pos->spl_plan= spl_plan;
pos->filter= best_filter;
loose_scan_opt.save_to_position(s, loose_scan_pos);
@ -9429,6 +9454,7 @@ bool JOIN::inject_cond_into_where(Item *injected_cond)
static Item * const null_ptr= NULL;
/*
Set up join struct according to the picked join order in
@ -9588,6 +9614,8 @@ bool JOIN::get_best_combination()
is_hash_join_key_no(j->ref.key))
hash_join= TRUE;
j->filter= best_positions[tablenr].filter;
loop_end:
/*
Save records_read in JOIN_TAB so that select_describe()/etc don't have
@ -12448,7 +12476,9 @@ ha_rows JOIN_TAB::get_examined_rows()
double examined_rows;
SQL_SELECT *sel= filesort? filesort->select : this->select;
if (sel && sel->quick && use_quick != 2)
if (filter)
examined_rows= records_read;
else if (sel && sel->quick && use_quick != 2)
examined_rows= (double)sel->quick->records;
else if (type == JT_NEXT || type == JT_ALL ||
type == JT_HASH || type ==JT_HASH_NEXT)
@ -24883,6 +24913,31 @@ int append_possible_keys(MEM_ROOT *alloc, String_list &list, TABLE *table,
}
/**
This method saves the data that should be printed in EXPLAIN
if any filter was used for this table.
*/
bool JOIN_TAB::save_filter_explain_data(Explain_table_access *eta)
{
if (!filter)
return 0;
KEY *pk_key= get_keyinfo_by_key_no(filter->key_no);
StringBuffer<64> buff_for_pk;
const char *tmp_buff;
buff_for_pk.append("filter:");
tmp_buff= pk_key->name.str;
buff_for_pk.append(tmp_buff, strlen(tmp_buff), system_charset_info);
if (!(eta->ref_list.append_str(join->thd->mem_root,
buff_for_pk.c_ptr_safe())))
return 1;
eta->key.set_filter_key_length(pk_key->key_length);
eta->filter_perc= (filter->selectivity*100 >= 1) ?
round(filter->selectivity*100) : 1;
return 0;
}
bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
table_map prefix_tables,
bool distinct_arg, JOIN_TAB *first_top_tab)
@ -25138,6 +25193,13 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
eta->filtered= f;
}
if ((tab_select && tab_select->quick && tab_type != JT_CONST) ||
(key_info && ref.key_parts && tab_type != JT_FT))
{
if (save_filter_explain_data(eta))
return 1;
}
/* Build "Extra" field and save it */
key_read= table->file->keyread_enabled();
if ((tab_type == JT_NEXT || tab_type == JT_CONST) &&
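
To restate the cost adjustment made in best_access_path() (a simplified sketch, not the patch itself): the filter returned by best_filter_for_current_join_order() already has a non-negative gain at the expected fanout record_count*records, and that gain is subtracted from the access cost only when it is smaller than the cost, so the adjusted estimate never becomes negative.

// Hypothetical helper mirroring the pattern
//   if (filter && filter->get_filter_gain(card) < tmp)
//     tmp= tmp - filter->get_filter_gain(card);
// where 'cost' plays the role of tmp and 'gain' that of get_filter_gain(card).
static double apply_filter_gain(double cost, double gain)
{
  if (gain > 0 && gain < cost)  // beneficial and keeps the estimate positive
    return cost - gain;
  return cost;                  // otherwise keep the unfiltered cost
}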


@ -506,6 +506,9 @@ typedef struct st_join_table {
uint n_sj_tables;
bool preread_init_done;
/* Copy of POSITION::filter, set by get_best_combination() */
Range_filter_cost_info *filter;
Dynamic_array<char*> rowid_filter_pk;
void cleanup();
inline bool is_using_loose_index_scan()
@ -656,6 +659,7 @@ typedef struct st_join_table {
SplM_plan_info *choose_best_splitting(double record_count,
table_map remaining_tables);
bool fix_splitting(SplM_plan_info *spl_plan, table_map remaining_tables);
bool save_filter_explain_data(Explain_table_access *eta);
} JOIN_TAB;
@ -881,6 +885,9 @@ public:
};
class Range_filter_cost_info;
/**
Information about a position of table within a join order. Used in join
optimization.
@ -963,6 +970,8 @@ typedef struct st_position
/* Info on splitting plan used at this position */
SplM_plan_info *spl_plan;
/* The best filter, if any, chosen to be applied at this position */
Range_filter_cost_info *filter;
} POSITION;
typedef Bounds_checked_array<Item_null_result*> Item_null_array;


@ -2493,6 +2493,7 @@ export const char *optimizer_switch_names[]=
"condition_pushdown_for_derived",
"split_materialized",
"condition_pushdown_for_subquery",
"rowid_filter",
"default",
NullS
};
@ -6113,3 +6114,10 @@ static Sys_var_enum Sys_secure_timestamp(
"historical behavior, anyone can modify session timestamp",
READ_ONLY GLOBAL_VAR(opt_secure_timestamp), CMD_LINE(REQUIRED_ARG),
secure_timestamp_levels, DEFAULT(SECTIME_NO));
static Sys_var_ulonglong Sys_max_rowid_filter_size(
"max_rowid_filter_size",
"The maximum number of rows that fit in memory",
SESSION_VAR(max_rowid_filter_size), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(1024, (ulonglong)~(intptr)0), DEFAULT(128*1024),
BLOCK_SIZE(1));
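
For context (a sketch with assumed numbers, not part of the commit): select_usable_range_filters() earlier in this patch treats max_rowid_filter_size as a byte budget for the filter container, so a range scan qualifies for a filter only when its estimated row count does not exceed max_rowid_filter_size divided by the rowid length (handler::ref_length).

#include <cstdio>

// Sketch of the admission test in select_usable_range_filters():
// usable when quick_rows[key] <= max_rowid_filter_size / ref_length.
static bool filter_fits(unsigned long long quick_rows,
                        unsigned long long max_rowid_filter_size,
                        unsigned ref_length)
{
  return quick_rows <= max_rowid_filter_size / ref_length;
}

int main()
{
  unsigned long long budget= 128 * 1024;  // the default shown above
  unsigned ref_length= 6;                 // assumed rowid length
  printf("max rows for an in-memory filter: %llu\n", budget / ref_length);
  printf("10000 rows fit: %d\n", filter_fits(10000, budget, ref_length));
  printf("50000 rows fit: %d\n", filter_fits(50000, budget, ref_length));
  return 0;
}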


@ -4633,6 +4633,8 @@ void TABLE::init(THD *thd, TABLE_LIST *tl)
created= TRUE;
cond_selectivity= 1.0;
cond_selectivity_sampling_explain= NULL;
best_filter_count= 0;
range_filter_cost_info_elements= 0;
#ifdef HAVE_REPLICATION
/* used in RBR Triggers */
master_had_triggers= 0;


@ -55,6 +55,7 @@ class Virtual_column_info;
class Table_triggers_list;
class TMP_TABLE_PARAM;
class SEQUENCE;
class Range_filter_cost_info;
/*
Used to identify NESTED_JOIN structures within a join (applicable only to
@ -1199,8 +1200,10 @@ public:
*/
key_part_map const_key_parts[MAX_KEY];
uint quick_key_parts[MAX_KEY];
uint quick_n_ranges[MAX_KEY];
/* The I/O access cost of the range scan is stored for each key */
double quick_key_io[MAX_KEY];
/*
Estimate of number of records that satisfy SARGable part of the table
@ -1491,6 +1494,21 @@ public:
void deny_splitting();
void add_splitting_info_for_key_field(struct KEY_FIELD *key_field);
/**
Range filter info
*/
/* Number of best filters at the start of range_filter_cost_info (0, 1 or 2) */
uint best_filter_count;
uint range_filter_cost_info_elements;
Range_filter_cost_info *range_filter_cost_info;
Range_filter_cost_info
*best_filter_for_current_join_order(uint ref_key_no,
double record_count,
double records);
void sort_range_filter_cost_info_array();
void prune_range_filters();
void select_usable_range_filters(THD *thd);
/**
System Versioning support
*/