Why do InnoDB need ReadView

之前其实一直不好理解为什么 InnoDB 多要了一个 ReadView,看完了某个回答( https://www.zhihu.com/question/40514055/answer/86935747)之后,突然理解了。

InnoDB MVCC

在系统中,有几个 txn:

  1. 事务的 DB_TRX_ID,如果是读写事务的话,会在 trx_sys_get_new_trx_id() 分配一个递增的 trx_id, 我们可以简单将其视作 start_trx_id
  2. 在事务 Commit 的时候,提交 undo 的时候,会分配 txn_no 给 Undo ( trx_serialisation_number_get -> trx_sys_get_new_trx_id() ),这个可以看作 end_ts
  3. 而每个读事务会在自己的 ReadView 中记录自己开始的时候看到的最大的 trx_no 为 m_low_limit_no。ReadView 列表按照 m_low_limit_no

InnoDB MVCC 相关的结构如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/** The MVCC read view manager */
class MVCC {
public:
/**
Allocate and create a view.
@param view view owned by this class created for the
caller. Must be freed by calling close()
@param trx transaction creating the view */
void view_open(ReadView *&view, trx_t *trx);

/**
Close a view created by the above function.
@param view view allocated by trx_open.
@param own_mutex true if caller owns trx_sys_t::mutex */
void view_close(ReadView *&view, bool own_mutex);

/**
Release a view that is inactive but not closed. Caller must own
the trx_sys_t::mutex.
@param view View to release */
void view_release(ReadView *&view);

/** Clones the oldest view and stores it in view. No need to
call view_close(). The caller owns the view that is passed in.
It will also move the closed views from the m_views list to the
m_free list. This function is called by Purge to create it view.
@param view Preallocated view, owned by the caller */
void clone_oldest_view(ReadView *view);

/**
@return the number of active views */
ulint size() const;

/**
@return true if the view is active and valid */
static bool is_view_active(ReadView *view) {
ut_a(view != reinterpret_cast<ReadView *>(0x1));

return (view != NULL && !(intptr_t(view) & 0x1));
}

/**
Set the view creator transaction id. Note: This shouldbe set only
for views created by RW transactions. */
static void set_view_creator_trx_id(ReadView *view, trx_id_t id);

private:
/**
Validates a read view list. */
bool validate() const;

/**
Find a free view from the active list, if none found then allocate
a new view. This function will also attempt to move delete marked
views from the active list to the freed list.
@return a view to use */
inline ReadView *get_view();

/**
Get the oldest view in the system. It will also move the delete
marked read views from the views list to the freed list.
@return oldest view if found or NULL */
inline ReadView *get_oldest_view() const;
ReadView *get_view_created_by_trx_id(trx_id_t trx_id) const;

private:
/** Free views ready for reuse. */
view_list_t m_free;

/** Active and closed views, the closed views will have the
creator trx id set to TRX_ID_MAX */
view_list_t m_views;
};

MVCC 会按照 m_low_limit_no 排序,一些详细的注释在 read/read0read.cc 里面,它还有个 free-list 避免重复的的分配(RC 之类的时候)。

而 ReadView 有着如下的结构:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class ReadView {


friend class MVCC;

private:
// Disable copying
ReadView(const ReadView &);
ReadView &operator=(const ReadView &);

private:
/** The read should not see any transaction with trx id >= this
value. In other words, this is the "high water mark". */
trx_id_t m_low_limit_id;

/** The read should see all trx ids which are strictly
smaller (<) than this value. In other words, this is the
low water mark". */
trx_id_t m_up_limit_id;

/** trx id of creating transaction, set to TRX_ID_MAX for free
views. */
trx_id_t m_creator_trx_id;

/** Set of RW transactions that was active when this snapshot
was taken */
ids_t m_ids;

/** The view does not need to see the undo logs for transactions
whose transaction number is strictly smaller (<) than this value:
they can be removed in purge if not needed by other views */
trx_id_t m_low_limit_no;

/** AC-NL-RO transaction view that has been "closed". */
bool m_closed;

typedef UT_LIST_NODE_T(ReadView) node_t;

/** List of read views in trx_sys */
byte pad1[64 - sizeof(node_t)];
node_t m_view_list;
};

那么,我们注意到:

  1. 对一个 readview, 有着 m_low_limit_idm_up_limit_id 两个水位,这两个水位可以过滤掉插入的
  2. m_creator_trx_id 代表自身,创建的内容对自己是可见的
  3. m_low_limit_no 代表创建时候的最大 id。

这个在 ReadView 创建的时候,可以看到具体的设置逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/**
Copy the transaction ids from the source vector */

void ReadView::copy_trx_ids(const trx_ids_t &trx_ids) {
ulint size = trx_ids.size();

if (m_creator_trx_id > 0) {
ut_ad(size > 0);
--size;
}

if (size == 0) {
m_ids.clear();
return;
}

m_ids.reserve(size);
m_ids.resize(size);

ids_t::value_type *p = m_ids.data();

/* Copy all the trx_ids except the creator trx id */

if (m_creator_trx_id > 0) {
// 拷贝 自己之外的元素
/* Note: We go through all this trouble because it is
unclear whether std::vector::resize() will cause an
overhead or not. We should test this extensively and
if the vector to vector copy is fast enough then get
rid of this code and replace it with more readable
and obvious code. The code below does exactly one copy,
and filters out the creator's trx id. */

trx_ids_t::const_iterator it =
std::lower_bound(trx_ids.begin(), trx_ids.end(), m_creator_trx_id);

ut_ad(it != trx_ids.end() && *it == m_creator_trx_id);

ulint i = std::distance(trx_ids.begin(), it);
ulint n = i * sizeof(trx_ids_t::value_type);

::memmove(p, &trx_ids[0], n);

n = (trx_ids.size() - i - 1) * sizeof(trx_ids_t::value_type);

ut_ad(i + (n / sizeof(trx_ids_t::value_type)) == m_ids.size());

if (n > 0) {
::memmove(p + i, &trx_ids[i + 1], n);
}
} else {
// 拷贝所有的元素
ulint n = size * sizeof(trx_ids_t::value_type);

::memmove(p, &trx_ids[0], n);
}

m_up_limit_id = m_ids.front();

#ifdef UNIV_DEBUG
/* Assert that all transaction ids in list are active. */
for (trx_ids_t::const_iterator it = trx_ids.begin(); it != trx_ids.end();
++it) {
trx_t *trx = trx_get_rw_trx_by_id(*it);
ut_ad(trx != NULL);
ut_ad(trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED);
}
#endif /* UNIV_DEBUG */
}

/**
Opens a read view where exactly the transactions serialized before this
point in time are seen in the view.
@param id Creator transaction id */

void ReadView::prepare(trx_id_t id) {
ut_ad(mutex_own(&trx_sys->mutex));

m_creator_trx_id = id;

m_low_limit_no = m_low_limit_id = m_up_limit_id = trx_sys->max_trx_id;

if (!trx_sys->rw_trx_ids.empty()) {
copy_trx_ids(trx_sys->rw_trx_ids);
} else {
m_ids.clear();
}

ut_ad(m_up_limit_id <= m_low_limit_id);

if (UT_LIST_GET_LEN(trx_sys->serialisation_list) > 0) {
const trx_t *trx;

trx = UT_LIST_GET_FIRST(trx_sys->serialisation_list);

if (trx->no < m_low_limit_no) {
m_low_limit_no = trx->no;
}
}

m_closed = false;
}

可以看到这里的设置非常巧妙。

接下来观察到一个事实,这里是根据 trx_sys->rw_trx_idstrx_sys->max_trx_id 还有 m_creator_trx_id 设置的:

  1. m_low_limit_id 设置为 trx_sys->max_trx_id ( 即 max(start-ts, end-ts))
  2. m_low_limit_no 设置为 min(min committing-txn-id, trx_sys->max_trx_id),表示已经提交的最大事务 id 或 pending 的最小事务 id,保证不会读到提交中的数据
  3. m_low_limit_idm_up_limit_id 都是对应的 Running Txn Id。m_up_limit_id 更新为 min(active-txn)

所以在检查的时候,这里有:

  1. 一条插入的边会有 DB_TRX_ID, 这个是 start_ts最大的问题是,这玩意并不是一个 commit_ts!
  2. 所以要根据 min-max 和 runing-txn 列表来判断它

我们反过来推断一下:

  1. 如果 start_ts == commit_ts -> 并不需要一个 ReadView
  2. 如果不需要这么精细的判断,比如分布式并不知道所有 runing txn 是什么,就不需要这个

TiDB / CRDB

我们回过头来看 TiDB,TiDB 是一个 Percolator 的模型,显然,在这里获得所有的 txn-id 是不现实的(我怀疑会导致提交成为瓶颈,或者打爆 PD),这个时候就需要对状态未决的事务捞状态甚至等待了。CRDB 其实也差不多。

Reference