| 1 | /* |
| 2 | * CDDL HEADER START |
| 3 | * |
| 4 | * The contents of this file are subject to the terms of the |
| 5 | * Common Development and Distribution License (the "License"). |
| 6 | * You may not use this file except in compliance with the License. |
| 7 | * |
| 8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| 9 | * or http://www.opensolaris.org/os/licensing. |
| 10 | * See the License for the specific language governing permissions |
| 11 | * and limitations under the License. |
| 12 | * |
| 13 | * When distributing Covered Code, include this CDDL HEADER in each |
| 14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| 15 | * If applicable, add the following below this CDDL HEADER, with the |
| 16 | * fields enclosed by brackets "[]" replaced with your own identifying |
| 17 | * information: Portions Copyright [yyyy] [name of copyright owner] |
| 18 | * |
| 19 | * CDDL HEADER END |
| 20 | */ |
| 21 | /* |
| 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
| 23 | * Copyright (c) 2012 by Delphix. All rights reserved. |
| 24 | * Copyright (c) 2014 Integros [integros.com] |
| 25 | */ |
| 26 | |
| 27 | /* Portions Copyright 2010 Robert Milkowski */ |
| 28 | |
| 29 | #ifndef _SYS_ZIL_H |
| 30 | #define _SYS_ZIL_H |
| 31 | |
| 32 | #include <sys/types.h> |
| 33 | #include <sys/spa.h> |
| 34 | #include <sys/zio.h> |
| 35 | #include <sys/dmu.h> |
| 36 | |
| 37 | #ifdef __cplusplus |
| 38 | extern "C" { |
| 39 | #endif |
| 40 | |
| 41 | struct dsl_pool; |
| 42 | struct dsl_dataset; |
| 43 | |
/*
 * Intent log format:
 *
 * Each objset has its own intent log.  The log header (zil_header_t)
 * for objset N's intent log is kept in the Nth object of the SPA's
 * intent_log objset.  The log header points to a chain of log blocks,
 * each of which contains log records (i.e., transactions) plus the
 * block's chaining structure (zil_chain_t; older text calls it the log
 * block trailer, zil_trailer_t - see "Log block chaining" below for
 * where in the block it is placed).  The format of a log record
 * depends on the record (or transaction) type, but all records begin
 * with a common structure (lr_t) that defines the type, length, and txg.
 */
| 55 | |
| 56 | /* |
| 57 | * Intent log header - this on disk structure holds fields to manage |
| 58 | * the log. All fields are 64 bit to easily handle cross architectures. |
| 59 | */ |
| 60 | typedef struct { |
| 61 | uint64_t ; /* txg in which log blocks were claimed */ |
| 62 | uint64_t ; /* highest replayed sequence number */ |
| 63 | blkptr_t ; /* log chain */ |
| 64 | uint64_t ; /* highest claimed block sequence number */ |
| 65 | uint64_t ; /* header flags */ |
| 66 | uint64_t ; /* highest claimed lr sequence number */ |
| 67 | uint64_t [3]; |
| 68 | } ; |
| 69 | |
/*
 * Bit values stored in the zh_flags field of the intent log header.
 */
#define	ZIL_REPLAY_NEEDED	0x1	/* replay needed - internal only */
#define	ZIL_CLAIM_LR_SEQ_VALID	0x2	/* zh_claim_lr_seq field is valid */
| 75 | |
| 76 | /* |
| 77 | * Log block chaining. |
| 78 | * |
| 79 | * Log blocks are chained together. Originally they were chained at the |
| 80 | * end of the block. For performance reasons the chain was moved to the |
| 81 | * beginning of the block which allows writes for only the data being used. |
| 82 | * The older position is supported for backwards compatability. |
| 83 | * |
| 84 | * The zio_eck_t contains a zec_cksum which for the intent log is |
| 85 | * the sequence number of this log block. A seq of 0 is invalid. |
| 86 | * The zec_cksum is checked by the SPA against the sequence |
| 87 | * number passed in the blk_cksum field of the blkptr_t |
| 88 | */ |
| 89 | typedef struct zil_chain { |
| 90 | uint64_t zc_pad; |
| 91 | blkptr_t zc_next_blk; /* next block in chain */ |
| 92 | uint64_t zc_nused; /* bytes in log block used */ |
| 93 | zio_eck_t zc_eck; /* block trailer */ |
| 94 | } zil_chain_t; |
| 95 | |
/* Smallest log block size (presumably in bytes - confirm against zil.c). */
#define	ZIL_MIN_BLKSZ	4096ULL
| 97 | |
/*
 * The words of a log block checksum, numbered 0 through 3.
 */
#define	ZIL_ZC_GUID_0	0
#define	ZIL_ZC_GUID_1	1
#define	ZIL_ZC_OBJSET	2
#define	ZIL_ZC_SEQ	3
| 105 | |
/*
 * Kinds of object creation.
 * NOTE(review): presumably selects between the file, directory and
 * xattr-directory create record flavors; confirm against the callers.
 */
typedef enum zil_create {
	Z_FILE = 0,
	Z_DIR = 1,
	Z_XATTRDIR = 2
} zil_create_t;
| 111 | |
/*
 * Size of the xvattr log section.
 * It is composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
 * for create time and a single 64 bit integer for all of the attributes,
 * and 4 64 bit integers (32 bytes) for the scanstamp.
 *
 * lr_attr_t already contains the first bitmap word, so only
 * (mapsize - 1) further uint32_t words are added.  Both the argument and
 * the whole expansion are parenthesized so that the macro yields the
 * intended value when it is embedded in a larger expression or is
 * handed a compound argument.
 */
#define	ZIL_XVAT_SIZE(mapsize) \
	(sizeof (lr_attr_t) + (sizeof (uint32_t) * ((mapsize) - 1)) + \
	(sizeof (uint64_t) * 7))
| 123 | |
/*
 * Size of an ACL in the log.  The ACE data is padded out so that it is
 * properly aligned on an 8 byte boundary.
 */
#define	ZIL_ACE_LENGTH(x)	(roundup(x, sizeof (uint64_t)))
| 130 | |
/*
 * Intent log transaction types.  The record structure used for each
 * type is defined further down in this header.
 */
#define	TX_CREATE		1	/* Create file */
#define	TX_MKDIR		2	/* Make directory */
#define	TX_MKXATTR		3	/* Make XATTR directory */
#define	TX_SYMLINK		4	/* Create symbolic link to a file */
#define	TX_REMOVE		5	/* Remove file */
#define	TX_RMDIR		6	/* Remove directory */
#define	TX_LINK			7	/* Create hard link to a file */
#define	TX_RENAME		8	/* Rename a file */
#define	TX_WRITE		9	/* File write */
#define	TX_TRUNCATE		10	/* Truncate a file */
#define	TX_SETATTR		11	/* Set file attributes */
#define	TX_ACL_V0		12	/* Set old formatted ACL */
#define	TX_ACL			13	/* Set ACL */
#define	TX_CREATE_ACL		14	/* create with ACL */
#define	TX_CREATE_ATTR		15	/* create + attrs */
#define	TX_CREATE_ACL_ATTR	16	/* create with ACL + attrs */
#define	TX_MKDIR_ACL		17	/* mkdir with ACL */
#define	TX_MKDIR_ATTR		18	/* mkdir with attr */
#define	TX_MKDIR_ACL_ATTR	19	/* mkdir with ACL + attrs */
#define	TX_WRITE2		20	/* dmu_sync EALREADY write */
#define	TX_MAX_TYPE		21	/* Max transaction type */
| 155 | |
/*
 * The transactions for mkdir, symlink, remove, rmdir, link, and rename
 * may carry the following bit (the top bit of a 64-bit value), which
 * indicates that the original request specified case-insensitive
 * handling of names.
 */
#define	TX_CI	((uint64_t)0x1 << 63)	/* case-insensitive behavior requested */
| 162 | |
/*
 * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
 * out of order.  For convenience in the code, all such records must have
 * lr_foid at the same offset.
 *
 * TX_OOO() is non-zero when txtype is one of those types (TX_WRITE2
 * included).  Note that the argument is evaluated more than once.
 */
#define	TX_OOO(txtype)			\
	((txtype) == TX_WRITE ||	\
	(txtype) == TX_TRUNCATE ||	\
	(txtype) == TX_SETATTR ||	\
	(txtype) == TX_ACL_V0 ||	\
	(txtype) == TX_ACL ||		\
	(txtype) == TX_WRITE2)
| 175 | |
/*
 * Format of log records.
 * The fields are carefully defined so that they are aligned and sized
 * the same on sparc & intel architectures.
 * Every log record starts with this common structure.
 *
 * The on-disk lrc_seq field holds the record's sequence number, which is
 * used to ensure we don't replay the same record twice.
 */
typedef struct {			/* common log record header */
	uint64_t	lrc_txtype;	/* intent log transaction type */
	uint64_t	lrc_reclen;	/* transaction record length */
	uint64_t	lrc_txg;	/* dmu transaction group number */
	uint64_t	lrc_seq;	/* see comment above */
} lr_t;
| 191 | |
| 192 | /* |
| 193 | * Common start of all out-of-order record types (TX_OOO() above). |
| 194 | */ |
| 195 | typedef struct { |
| 196 | lr_t lr_common; /* common portion of log record */ |
| 197 | uint64_t lr_foid; /* object id */ |
| 198 | } lr_ooo_t; |
| 199 | |
/*
 * Handle optional extended vattr attributes.
 *
 * Whenever new attributes are added the version number
 * will need to be updated, as will the code in
 * zfs_log.c and zfs_replay.c
 *
 * Only the first bitmap word is declared here; the rest of the bitmap
 * array and any additional fields follow the structure in the record.
 */
typedef struct {
	uint32_t	lr_attr_masksize; /* number of elements in array */
	uint32_t	lr_attr_bitmap;	/* First entry of array */
	/* remainder of array and any additional fields */
} lr_attr_t;
| 212 | |
| 213 | /* |
| 214 | * log record for creates without optional ACL. |
| 215 | * This log record does support optional xvattr_t attributes. |
| 216 | */ |
| 217 | typedef struct { |
| 218 | lr_t lr_common; /* common portion of log record */ |
| 219 | uint64_t lr_doid; /* object id of directory */ |
| 220 | uint64_t lr_foid; /* object id of created file object */ |
| 221 | uint64_t lr_mode; /* mode of object */ |
| 222 | uint64_t lr_uid; /* uid of object */ |
| 223 | uint64_t lr_gid; /* gid of object */ |
| 224 | uint64_t lr_gen; /* generation (txg of creation) */ |
| 225 | uint64_t lr_crtime[2]; /* creation time */ |
| 226 | uint64_t lr_rdev; /* rdev of object to create */ |
| 227 | /* name of object to create follows this */ |
| 228 | /* for symlinks, link content follows name */ |
| 229 | /* for creates with xvattr data, the name follows the xvattr info */ |
| 230 | } lr_create_t; |
| 231 | |
| 232 | /* |
| 233 | * FUID ACL record will be an array of ACEs from the original ACL. |
| 234 | * If this array includes ephemeral IDs, the record will also include |
| 235 | * an array of log-specific FUIDs to replace the ephemeral IDs. |
| 236 | * Only one copy of each unique domain will be present, so the log-specific |
| 237 | * FUIDs will use an index into a compressed domain table. On replay this |
| 238 | * information will be used to construct real FUIDs (and bypass idmap, |
| 239 | * since it may not be available). |
| 240 | */ |
| 241 | |
| 242 | /* |
| 243 | * Log record for creates with optional ACL |
| 244 | * This log record is also used for recording any FUID |
| 245 | * information needed for replaying the create. If the |
| 246 | * file doesn't have any actual ACEs then the lr_aclcnt |
| 247 | * would be zero. |
| 248 | * |
| 249 | * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. |
| 250 | * If create is also setting xvattr's, then acl data follows xvattr. |
| 251 | * If ACE FUIDs are needed then they will follow the xvattr_t. Following |
| 252 | * the FUIDs will be the domain table information. The FUIDs for the owner |
| 253 | * and group will be in lr_create. Name follows ACL data. |
| 254 | */ |
| 255 | typedef struct { |
| 256 | lr_create_t lr_create; /* common create portion */ |
| 257 | uint64_t lr_aclcnt; /* number of ACEs in ACL */ |
| 258 | uint64_t lr_domcnt; /* number of unique domains */ |
| 259 | uint64_t lr_fuidcnt; /* number of real fuids */ |
| 260 | uint64_t lr_acl_bytes; /* number of bytes in ACL */ |
| 261 | uint64_t lr_acl_flags; /* ACL flags */ |
| 262 | } lr_acl_create_t; |
| 263 | |
| 264 | typedef struct { |
| 265 | lr_t lr_common; /* common portion of log record */ |
| 266 | uint64_t lr_doid; /* obj id of directory */ |
| 267 | /* name of object to remove follows this */ |
| 268 | } lr_remove_t; |
| 269 | |
| 270 | typedef struct { |
| 271 | lr_t lr_common; /* common portion of log record */ |
| 272 | uint64_t lr_doid; /* obj id of directory */ |
| 273 | uint64_t lr_link_obj; /* obj id of link */ |
| 274 | /* name of object to link follows this */ |
| 275 | } lr_link_t; |
| 276 | |
| 277 | typedef struct { |
| 278 | lr_t lr_common; /* common portion of log record */ |
| 279 | uint64_t lr_sdoid; /* obj id of source directory */ |
| 280 | uint64_t lr_tdoid; /* obj id of target directory */ |
| 281 | /* 2 strings: names of source and destination follow this */ |
| 282 | } lr_rename_t; |
| 283 | |
| 284 | typedef struct { |
| 285 | lr_t lr_common; /* common portion of log record */ |
| 286 | uint64_t lr_foid; /* file object to write */ |
| 287 | uint64_t lr_offset; /* offset to write to */ |
| 288 | uint64_t lr_length; /* user data length to write */ |
| 289 | uint64_t lr_blkoff; /* no longer used */ |
| 290 | blkptr_t lr_blkptr; /* spa block pointer for replay */ |
| 291 | /* write data will follow for small writes */ |
| 292 | } lr_write_t; |
| 293 | |
| 294 | typedef struct { |
| 295 | lr_t lr_common; /* common portion of log record */ |
| 296 | uint64_t lr_foid; /* object id of file to truncate */ |
| 297 | uint64_t lr_offset; /* offset to truncate from */ |
| 298 | uint64_t lr_length; /* length to truncate */ |
| 299 | } lr_truncate_t; |
| 300 | |
| 301 | typedef struct { |
| 302 | lr_t lr_common; /* common portion of log record */ |
| 303 | uint64_t lr_foid; /* file object to change attributes */ |
| 304 | uint64_t lr_mask; /* mask of attributes to set */ |
| 305 | uint64_t lr_mode; /* mode to set */ |
| 306 | uint64_t lr_uid; /* uid to set */ |
| 307 | uint64_t lr_gid; /* gid to set */ |
| 308 | uint64_t lr_size; /* size to set */ |
| 309 | uint64_t lr_atime[2]; /* access time */ |
| 310 | uint64_t lr_mtime[2]; /* modification time */ |
| 311 | /* optional attribute lr_attr_t may be here */ |
| 312 | } lr_setattr_t; |
| 313 | |
| 314 | typedef struct { |
| 315 | lr_t lr_common; /* common portion of log record */ |
| 316 | uint64_t lr_foid; /* obj id of file */ |
| 317 | uint64_t lr_aclcnt; /* number of acl entries */ |
| 318 | /* lr_aclcnt number of ace_t entries follow this */ |
| 319 | } lr_acl_v0_t; |
| 320 | |
| 321 | typedef struct { |
| 322 | lr_t lr_common; /* common portion of log record */ |
| 323 | uint64_t lr_foid; /* obj id of file */ |
| 324 | uint64_t lr_aclcnt; /* number of ACEs in ACL */ |
| 325 | uint64_t lr_domcnt; /* number of unique domains */ |
| 326 | uint64_t lr_fuidcnt; /* number of real fuids */ |
| 327 | uint64_t lr_acl_bytes; /* number of bytes in ACL */ |
| 328 | uint64_t lr_acl_flags; /* ACL flags */ |
| 329 | /* lr_acl_bytes number of variable sized ace's follows */ |
| 330 | } lr_acl_t; |
| 331 | |
/*
 * ZIL structure definitions, interface function prototypes, and globals.
 */
| 335 | |
/*
 * Writes are handled in three different ways:
 *
 * WR_INDIRECT:
 *    In this mode, if we need to commit the write later, then the block
 *    is immediately written into the file system (using dmu_sync),
 *    and a pointer to the block is put into the log record.
 *    When the txg commits the block is linked in.
 *    This saves additionally writing the data into the log record.
 *    There are a few requirements for this to occur:
 *	- write is greater than zfs/zvol_immediate_write_sz
 *	- not using slogs (as slogs are assumed to always be faster
 *	  than writing into the main pool)
 *	- the write occupies only one block
 * WR_COPIED:
 *    If we know we'll immediately be committing the
 *    transaction (FSYNC or FDSYNC), then we allocate a larger
 *    log record here for the data and copy the data in.
 * WR_NEED_COPY:
 *    Otherwise we don't allocate a buffer, and *if* we need to
 *    flush the write later then a buffer is allocated and
 *    we retrieve the data using the dmu.
 *
 * WR_NUM_STATES is not a state; it is the count of the states above.
 */
typedef enum {
	WR_INDIRECT,	/* indirect - a large write (dmu_sync() data */
			/* and put blkptr in log, rather than actual data) */
	WR_COPIED,	/* immediate - data is copied into lr_write_t */
	WR_NEED_COPY,	/* immediate - data needs to be copied if pushed */
	WR_NUM_STATES	/* number of states */
} itx_wr_state_t;
| 366 | |
| 367 | typedef struct itx { |
| 368 | list_node_t itx_node; /* linkage on zl_itx_list */ |
| 369 | void *itx_private; /* type-specific opaque data */ |
| 370 | itx_wr_state_t itx_wr_state; /* write state */ |
| 371 | uint8_t itx_sync; /* synchronous transaction */ |
| 372 | uint64_t itx_oid; /* object id */ |
| 373 | lr_t itx_lr; /* common part of log record */ |
| 374 | /* followed by type-specific part of lr_xx_t and its immediate data */ |
| 375 | } itx_t; |
| 376 | |
| 377 | typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, |
| 378 | uint64_t txg); |
| 379 | typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, |
| 380 | uint64_t txg); |
| 381 | typedef int zil_replay_func_t(); |
| 382 | typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); |
| 383 | |
| 384 | extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, |
| 385 | zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); |
| 386 | |
| 387 | extern void zil_init(void); |
| 388 | extern void zil_fini(void); |
| 389 | |
| 390 | extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); |
| 391 | extern void zil_free(zilog_t *zilog); |
| 392 | |
| 393 | extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); |
| 394 | extern void zil_close(zilog_t *zilog); |
| 395 | |
| 396 | extern void zil_replay(objset_t *os, void *arg, |
| 397 | zil_replay_func_t *replay_func[TX_MAX_TYPE]); |
| 398 | extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); |
| 399 | extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); |
| 400 | extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); |
| 401 | extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); |
| 402 | |
| 403 | extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); |
| 404 | extern void zil_itx_destroy(itx_t *itx); |
| 405 | extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); |
| 406 | |
| 407 | extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); |
| 408 | extern void zil_commit(zilog_t *zilog, uint64_t oid); |
| 409 | |
| 410 | extern int zil_vdev_offline(const char *osname, void *txarg); |
| 411 | extern int zil_claim(struct dsl_pool *dp, |
| 412 | struct dsl_dataset *ds, void *txarg); |
| 413 | extern int zil_check_log_chain(struct dsl_pool *dp, |
| 414 | struct dsl_dataset *ds, void *tx); |
| 415 | extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); |
| 416 | extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); |
| 417 | |
| 418 | extern int zil_suspend(const char *osname, void **cookiep); |
| 419 | extern void zil_resume(void *cookie); |
| 420 | |
| 421 | extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp); |
| 422 | extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); |
| 423 | |
| 424 | extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); |
| 425 | |
| 426 | extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); |
| 427 | |
/*
 * Global defined outside this header.
 * NOTE(review): presumably a tunable that disables intent log replay
 * when non-zero; confirm against its definition.
 */
extern int zil_replay_disable;
| 429 | |
| 430 | #ifdef __cplusplus |
| 431 | } |
| 432 | #endif |
| 433 | |
| 434 | #endif /* _SYS_ZIL_H */ |
| 435 | |