summaryrefslogtreecommitdiffstats
path: root/src/os/bluestore/NVMEDevice.h
blob: f670e308e435602e8babaadefd27ba1352894112 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 XSky <haomai@xsky.com>
 *
 * Author: Haomai Wang <haomaiwang@gmail.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_OS_BLUESTORE_NVMEDEVICE
#define CEPH_OS_BLUESTORE_NVMEDEVICE

#include <queue>
#include <map>
#include <limits>

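// _Static_assert is a C11 keyword that C++ does not provide under that name;
// alias it to static_assert (presumably for C headers that end up included
// alongside this one -- confirm which ones still need it).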
#define _Static_assert static_assert


#include "include/atomic.h"
#include "include/interval_set.h"
#include "common/ceph_time.h"
#include "common/Mutex.h"
#include "BlockDevice.h"

/**
 * The I/O command kinds named by this header: read, write and flush.
 *
 * Values are written out explicitly; they are the same 0, 1, 2 the
 * enumerators would receive implicitly.
 */
enum class IOCommand {
  READ_COMMAND = 0,
  WRITE_COMMAND = 1,
  FLUSH_COMMAND = 2
};

class Task;
class PerfCounters;
class SharedDriverData;

/**
 * NVMe-backed implementation of the BlockDevice interface.
 *
 * This header holds only the declarations plus the BufferedExtents helper;
 * every I/O method is defined out of line, so its exact behaviour is not
 * visible here and is governed by the BlockDevice contract.
 */
class NVMEDevice : public BlockDevice {
  /**
   * points to pinned, physically contiguous memory region;
   * contains 4KB IDENTIFY structure for controller which is
   *  target for CONTROLLER IDENTIFY command during initialization
   *
   * NOTE(review): the text above describes an IDENTIFY buffer, which does
   * not match the type of the member it sits on (SharedDriverData *).
   * It looks stale -- confirm and reword.
   */
  SharedDriverData *driver = nullptr;
  string name;

  uint64_t size = 0;        // value reported by get_size()
  uint64_t block_size = 0;  // value reported by get_block_size()

  bool aio_stop = false;

  /**
   * Offset-indexed overlay of byte ranges backed by caller-owned buffers.
   *
   * The map is keyed by the start offset of each extent and insert() keeps
   * the extents non-overlapping: a newer range trims, splits or removes
   * whatever older extents it covers. Only pointers are stored, never
   * copies of the bytes.
   */
  struct BufferedExtents {
    struct Extent {
      uint64_t x_len;     // number of bytes this extent covers, from its map key
      uint64_t x_off;     // offset inside `data` of the extent's first byte
      const char *data;   // backing buffer; stored as given, not copied
      uint64_t data_len;  // length passed to the insert() that supplied `data`
    };
    using Offset = uint64_t;
    map<Offset, Extent> buffered_extents;
    // Smallest start / largest end ever passed to insert() since the last
    // clear(). They only grow, and let read_overlap() reject requests that
    // lie completely outside everything buffered without touching the map.
    uint64_t left_edge = std::numeric_limits<uint64_t>::max();
    uint64_t right_edge = 0;

    /**
     * Debug helper: asserts that no two extents in the map overlap.
     * Its only visible call site is the disabled `if (0)` in insert().
     */
    void verify() {
      interval_set<uint64_t> m;
      for (auto && it : buffered_extents) {
        assert(!m.intersects(it.first, it.second.x_len));
        m.insert(it.first, it.second.x_len);
      }
    }

    /**
     * Record that the range [off, off + len) is now backed by `data`.
     *
     * Older extents overlapping the range are trimmed, split in two, or
     * erased so that the new extent is the only one covering it.
     *
     * @param off   start offset of the range
     * @param len   length of the range in bytes; 0 is a no-op
     * @param data  buffer holding the bytes; only the pointer is kept, so it
     *              has to stay valid for as long as an extent refers to it
     */
    void insert(uint64_t off, uint64_t len, const char *data) {
      // An empty range covers no bytes. Without this guard the final
      // buffered_extents[off] assignment would replace a live extent that
      // starts at `off` (or the tail piece just split off at `off`) with an
      // empty one, silently dropping its bytes.
      if (len == 0)
        return;

      // Begin at the first extent that can overlap the range: normally the
      // first one starting at or after `off`, but step back one entry when
      // the preceding extent reaches past `off`.
      auto it = buffered_extents.lower_bound(off);
      if (it != buffered_extents.begin()) {
        --it;
        if (it->first + it->second.x_len <= off)
          ++it;
      }
      uint64_t end = off + len;
      if (off < left_edge)
        left_edge = off;
      if (end > right_edge)
        right_edge = end;
      while (it != buffered_extents.end()) {
        if (it->first >= end)
          break;
        uint64_t extent_it_end = it->first + it->second.x_len;
        assert(extent_it_end >= off);
        if (it->first <= off) {
          if (extent_it_end > end) {
            //         <-     data    ->
            // <-            it           ->
            // Keep the head of `it` in place and re-insert its tail at `end`.
            // x_len is shrunk first on purpose: the tail's x_off is derived
            // from the already-shortened head length.
            it->second.x_len -= (extent_it_end - off);
            Extent tail{extent_it_end - end,
                        it->second.x_off + it->second.x_len + len,
                        it->second.data, it->second.data_len};
            buffered_extents[end] = tail;
          } else {
            //         <-     data    ->
            // <-     it    ->
            // Only the head of `it` survives (possibly with length 0 when
            // it starts exactly at `off`; that entry is overwritten below).
            assert(extent_it_end <= end);
            it->second.x_len -= (extent_it_end - off);
          }
          ++it;
        } else {
          assert(it->first > off);
          if (extent_it_end > end) {
            //  <-     data    ->
            //      <-           it          ->
            // The part of `it` beyond `end` survives under a new key.
            uint64_t overlap = end - it->first;
            Extent tail{it->second.x_len - overlap,
                        it->second.x_off + overlap,
                        it->second.data, it->second.data_len};
            buffered_extents[end] = tail;
          }
          //  <-     data    ->
          //      <- it ->
          // Either way the entry under the old key is fully covered now.
          it = buffered_extents.erase(it);
        }
      }
      buffered_extents[off] = Extent{
          len, 0, data, len};

      if (0)
        verify();
    }

    /**
     * Copy `copylen` bytes out of the buffer behind extent `it`, starting
     * `src_off` bytes into the extent, to `dst + dst_off`.
     *
     * The bounds assertions are kept in the source but disabled by `if (0)`.
     */
    void memcpy_check(char *dst, uint64_t dst_raw_len, uint64_t dst_off,
                      map<Offset, Extent>::iterator &it, uint64_t src_off, uint64_t copylen) {
      if (0) {
        assert(dst_off + copylen <= dst_raw_len);
        assert(it->second.x_off + src_off + copylen <= it->second.data_len);
      }
      memcpy(dst + dst_off, it->second.data + it->second.x_off + src_off, copylen);
    }

    /**
     * Fill `buf` with whatever part of [off, off + len) is covered by
     * extents. Byte i of `buf` corresponds to offset off + i; positions not
     * covered by any extent are left untouched.
     *
     * @return total number of bytes copied. Because extents never overlap,
     *         a return value equal to `len` means the whole range was
     *         served; a smaller value does not say which parts were.
     */
    uint64_t read_overlap(uint64_t off, uint64_t len, char *buf) {
      uint64_t end = off + len;
      if (end <= left_edge || off >= right_edge)
        return 0;

      uint64_t copied = 0;
      // Same starting-point search as in insert().
      auto it = buffered_extents.lower_bound(off);
      if (it != buffered_extents.begin()) {
        --it;
        if (it->first + it->second.x_len <= off)
          ++it;
      }
      while (it != buffered_extents.end()) {
        if (it->first >= end)
          break;
        uint64_t extent_it_end = it->first + it->second.x_len;
        assert(extent_it_end >= off);
        // Clamp the extent to the requested window. This single computation
        // covers all four relative positions of extent and window (extent
        // starting before/inside the window, ending inside/after it).
        uint64_t start = it->first > off ? it->first : off;
        uint64_t stop = extent_it_end < end ? extent_it_end : end;
        uint64_t copy_len = stop - start;
        memcpy_check(buf, len, start - off, it, start - it->first, copy_len);
        copied += copy_len;
        ++it;
      }
      return copied;
    }

    /** Drop every extent and reset both edges to their "nothing buffered" values. */
    void clear() {
      buffered_extents.clear();
      left_edge = std::numeric_limits<uint64_t>::max();
      right_edge = 0;
    }
  };
  // NOTE(review): presumably buffer_lock guards buffered_extents and
  // buffered_task_head -- the locking itself happens in the out-of-line
  // method bodies, confirm there.
  Mutex buffer_lock;
  BufferedExtents buffered_extents;
  Task *buffered_task_head = nullptr;

  static void init();

 public:
  SharedDriverData *get_driver() { return driver; }

  aio_callback_t aio_callback;
  void *aio_callback_priv;

  NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv);

  // This device reports that it does not support a bdev label.
  bool supported_bdev_label() override { return false; }

  void aio_submit(IOContext *ioc) override;

  uint64_t get_size() const override {
    return size;
  }
  uint64_t get_block_size() const override {
    return block_size;
  }

  // BlockDevice I/O overrides; defined out of line.
  int read(uint64_t off, uint64_t len, bufferlist *pbl,
           IOContext *ioc,
           bool buffered) override;
  int aio_read(
    uint64_t off,
    uint64_t len,
    bufferlist *pbl,
    IOContext *ioc) override;
  int aio_write(uint64_t off, bufferlist& bl,
                IOContext *ioc,
                bool buffered) override;
  int write(uint64_t off, bufferlist& bl, bool buffered) override;
  int flush() override;
  int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;

  // for managing buffered readers/writers
  int invalidate_cache(uint64_t off, uint64_t len) override;
  int open(const string& path) override;
  void close() override;
  int collect_metadata(string prefix, map<string,string> *pm) const override;
};

#endif