Upload
others
View
8
Download
0
Embed Size (px)
Citation preview
10-P4: Layered Block-Structured File System
Slides originally by Prof. van Renesse. Current version by Anne Bracy
1
Intro
• Underneath any file system, database system, etc. there are one or more block stores
• The block store abstraction doesn’t deal with file naming
2
File System API and Performance
Device Access
ApplicationLibrary
File SystemBlock Cache
Block Device InterfaceDevice Driver
Memory-Mapped I/O,DMA, Interrupts
Physical Device
BlockStore
Block Store Abstraction
• Provides a disk-like interface:
  – a sequence of blocks numbered 0, 1, … (typically a few kilobytes)
  – you can read or write 1 block at a time
10-P4 has you work with multiple versions/instantiations of this abstraction.
3
BlockStoreBenefits
• Performance:
  – Caches recently read blocks
  – Buffers recently written blocks (to be written later)
• Synchronization:
  – all requests for a given block go through the block cache
  – For each entry, the OS includes information to:
    • prevent a process from reading a block while another writes it
    • ensure that a given block is only fetched from the storage device once, even if it is simultaneously read by many processes
4
Heads up about the code!
This entire code base is what happens when you want object-oriented programming, but you only have C.
Put on your C++/Java goggles!
block_if (a block interface) is essentially an abstract class
5
Contentsofblock_if.h#define BLOCK_SIZE 512 // # bytes in a block
typedef unsigned int block_no; // index of a block
struct block { char bytes[BLOCK_SIZE]; };typedef struct block block_t;
typedef struct block_if *block_if;
struct block_if {void *state;int (*nblocks)(block_if bif);int (*read)(block_if bif, block_no offset, block_t *block);int (*write)(block_if bif, block_no offset, block_t *block);int (*setsize)(block_if bif, block_no size);void (*destroy)(block_if bif);
};
6
← poor man’s class
None of this is data! All typedefs!
← pointer to the interface
block_if: Block Store Interface
• xxx_init(…) → block_if
  – Name & signature varies, sets up all the fn pointers
• nblocks() → integer
  – returns the size of the block store in #blocks
• read(block number) → block
  – returns the contents of the given block number
• write(block number, block)
  – writes the block contents at the given block number
• setsize(nblocks)
  – sets the size of the block store
• destroy()
  – frees everything associated with this block store
7
← “constructor”
← “destructor”
Simple block stores
• disk: simulated disk stored on a Linux file
  – block_if bif = disk_init(char *filename, int nblocks) (could also use a real disk using /dev/*disk devices)
• ramdisk: a simulated disk in memory
  – block_if bif = ramdisk_init(block_t *blocks, nblocks)
  • Fast but volatile
8
SampleProgram#include ...#include “block_if.h”
int main(){block_if disk = disk_init(“disk.dev”, 1024);block_t block;strcpy(block.bytes, “Hello World”);(*disk->write)(disk, 0, &block);(*disk->destroy)(disk);return 0;
}
gcc -g block_if.c sample.cgdb then check out disk.dev
9
BlockStorescanbeLayered!
Eachlayerpresentsablockstoreabstraction
CACHEDISK
STATDISK
DISK
block_if
keepsacacheofrecentlyusedblocks
keepstrackof#readsand#writesforstatistics
keepsblocksinaLinuxfile
10
Examplecodewithlayers#define CACHE_SIZE 10 // #blocks in cache
block_t cache[CACHE_SIZE];
int main(){block_if disk = disk_init(“disk.dev”, 1024);block_if sdisk = statdisk_init(disk);block_if cdisk = cachedisk_init(sdisk, cache, CACHE_SIZE);
block_t block;strcpy(block.bytes, “Hello World”);(*cdisk->write)(cdisk, 0, &block);(*cdisk->destroy)(cdisk);(*sdisk->destroy)(sdisk);(*disk->destroy)(disk);
return 0;}
gcc -g block_if.c statdisk.c cachedisk.c layer.c11
CACHEDISK
STATDISK
DISK
ExampleLayersblock_if clockdisk_init(block_if below,
block_t *blocks, block_no nblocks);// implements CLOCK cache allocation / eviction
block_if statdisk_init(block_if below);// counts all reads and writes
block_if debugdisk_init(block_if below, char *descr);// prints all reads and writes
block_if checkdisk_init(block_if below);// checks that what’s read is what was written
12
Howtowritealayerstruct statdisk_state {
block_if below; // block store belowunsigned int nread, nwrite; // stats
};
block_if statdisk_init(block_if below){struct statdisk_state *sds = calloc(1, sizeof(*sds));sds->below = below;
block_if bi = calloc(1, sizeof(*bi));bi->state = sds;bi->nblocks = statdisk_nblocks;bi->setsize = statdisk_setsize;bi->read = statdisk_read;bi->write = statdisk_write;bi->destroy = statdisk_destroy;return bi;
} 13
statdisk implementation,cont’dint statdisk_read(block_if bi, block_no offset, block_t *block){
struct statdisk_state *sds = bi->state;sds->nread++;return (*sds->below->read)(sds->below, offset, block);
}
int statdisk_write(block_if bi, block_no offset, block_t *block){struct statdisk_state *sds = bi->state;sds->nwrite++;return (*sds->below->write)(sds->below, offset, block);
}
void statdisk_destroy(block_if bi){free(bi->state);free(bi);
}all 3 functions declared static 14
Why don’t we destroy the below?
Sharing a Block Store
• One could create multiple partitions, one for each file, but that has very similar problems to partitioning physical memory among processes
• You want something similar to paging
  – more efficient and flexible sharing
  – techniques are very similar!
Solution: File Systems!
15
File#1 File#2 File#3
Treedisk
• A file system, similar to Unix file systems (this Thursday)
• Initialized to support N virtual block stores (AKA files)
• Underlying block store (below) partitioned into 3 sections:
  1. Superblock: block #0
  2. Fixed number of i-node blocks: starts at block #1
     – Function of N (enough to store N i-nodes)
  3. Remaining blocks: starts after i-node blocks
     – data blocks, free blocks, indirect blocks, freelist blocks
16
blocknumber 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
blocks:
Remainingblocksi-nodeblocks
superblock
TypesofBlocksinTreedisk
• Superblock: the 0th block below
• Freelist block: list of all unused blocks below
• I-node block: list of inodes
• Indir block: list of blocks
• Data block: just data
17
union treedisk_block {block_t datablock;struct treedisk_superblock superblock;struct treedisk_inodeblock inodeblock;struct treedisk_freelistblock freelistblock;struct treedisk_indirblock indirblock;
};
treedisk Superblock
// one per underlying block storestruct treedisk_superblock {
block_no n_inodeblocks; block_no free_list; // 1st block on free list
// 0 means no free blocks};
18
blocknumber 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
blocks:
remainingblocksinode blockssuperblock
Notice: there are no pointers. Everything is a block number.
n_inodeblocks 4free_list ?(some green box)
treedisk FreeList
struct treedisk_freelistblock {block_no refs[REFS_PER_BLOCK];
};
refs[0]: # of another freelist block, or 0 if end of list
refs[i]: # of a free block for i ≥ 1, or 0 if the slot is empty
19
blocknumber 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
blocks: 413
remainingblocksinode blockssuperblock
0678
5101112
914150Suppose REFS_PER_BLOCK = 4
treedisk freelist
n_inodeblocks #
free_listsuperblock:
0 0 0freelist block
0freelist block
freeblock
freeblockfreeblock
freeblock
20
treedisk I-nodeblock
21
blocknumber 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
blocks:
remainingblocksinode blockssuperblock
struct treedisk_inodeblock {struct treedisk_inode inodes[INODES_PER_BLOCK];
};
struct treedisk_inode {block_no nblocks; // # blocks in virtual block storeblock_no root; // block # of root node of tree (or 0)
};
11500
inode[0]
inode[1]
91400
SupposeREFS_PER_BLOCK = 4
What if the file is bigger than 1 block?
treedisk Indirectblock
22
blocknumber 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
blocks:
remainingblocksinode blockssuperblock
struct treedisk_indirblock {block_no refs[REFS_PER_BLOCK];
};
115314
Suppose INODES_PER_BLOCK = 2
inode[0]
inode[1]
nblocksroot
nblocksroot
1312110
virtualblockstore:3blocks
nblocks 3
rooti-node:
indirectblock
datablock
datablock
datablock
23
What if the file is bigger than 3 blocks?
treedisk virtualblockstore
nblocks ####
rooti-node: (double)indirectblock
indirectblock indirectblock
datablock
datablockdatablock
24
How do I know if this is data or a block number?
treedisk virtualblockstore
• all data blocks at bottom level
• #levels: $\lceil \log_{\mathrm{RPB}}(\#\text{blocks}) \rceil + 1$, where RPB = REFS_PER_BLOCK
• For example, if RPB = 16:
#blocks #levels
0 0
1 1
2- 16 2
17- 256 3
257- 4096 4
REFS_PER_BLOCK is more commonly at least 128 or so
25
virtualblockstore:withhole
nblocks 3
rooti-node: indirectblock
datablock
datablock
• A hole appears as a virtual block filled with null bytes
• the pointer to an indirect block can be 0 too
• a virtual block store can be much larger than the “physical” block store underneath!
26
0
Puttingitalltogether
27
blocknumber 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
blocks: 413
remainingblocksinodeblocks
superblock
0678
51000
1312110
115314
inode[0]
inode[1]
nblocksroot
nblocksroot
Ashort-livedtreedisk filesystem#define DISK_SIZE 1024#define MAX_INODES 128
int main(){block_if disk = disk_init(“disk.dev”, DISK_SIZE);
treedisk_create(disk, MAX_INODES);
treedisk_check(disk); // optional: check integrity of file system
(*disk->destroy)(disk);
return 0;}
28
Examplecodewithtreediskblock_t cache[CACHE_SIZE];
int main(){block_if disk = disk_init(“disk.dev”, 1024);block_if cdisk = cachedisk_init(disk, cache, CACHE_SIZE);block_if disk0 = treedisk_init(cdisk, 0);block_if disk1 = treedisk_init(cdisk, 1);
block_t block;(*disk0->read)(disk0, 4, &block);(*disk1->read)(disk1, 4, &block);
(*disk0->destroy)(disk0);(*disk1->destroy)(disk1);(*cdisk->destroy)(cdisk);(*disk->destroy)(disk);
return 0;} 29
Layeringontopoftreedisk
TREEDISK
CACHEDISK
DISK
inode 0 inode 1 inode …
block_if treedisk_init(block_if below,unsigned int inode_no);
TREEDISK TREEDISK
30
...
...
traceutility
TREEDISK
CHECKDISK
STATDISK
CHECKDISK CHECKDISK CHECKDISK
TREEDISK TREEDISK
TRACEDISK
RAMDISK31
CACHEDISK
...
...
tracedisk
• disk and ramdisk are bottom-level block stores
• tracedisk is a top-level block store
  – or “application-level” if you will
  – you can’t layer on top of it
block_if tracedisk_init(block_if below,char *trace, //tracefilenameunsigned int n_inodes);
32
Example trace file
W:0:0 // write inode 0, block 0
N:0:1 // checks if inode 0 is of size 1
W:1:1 // write inode 1, block 1
N:1:2 // checks if inode 1 is of size 2
R:1:1 // read inode 1, block 1
S:1:0 // set size of inode 1 to 0
N:1:0 // checks if inode 1 is of size 0
ifNfails, prints “!!CHKSIZE ..”
33
Compiling and Running
• run “make” in the release directory
  – this generates an executable called “trace”
• run “./trace”
  – this reads trace file “trace.txt”
  – you can pass another trace file as argument
    • ./trace myowntracefile
34
Output to be expected
$ make
cc -Wall -c -o trace.o trace.c
. . .
cc -Wall -c -o treedisk_chk.o treedisk_chk.c
cc -o trace trace.o block_if.o cachedisk.o checkdisk.o clockdisk.o debugdisk.o disk.o ramdisk.o statdisk.o tracedisk.o treedisk.o treedisk_chk.o
$ ./trace
blocksize: 512
refs/block: 128
!!TDERR: setsize not yet supported
!!ERROR: tracedisk_run: setsize(1, 0) failed
!!CHKSIZE 10: nblocks 1: 0 != 2
!$STAT: #nnblocks: 5 ← bug!
!$STAT: #nsetsize: 0
!$STAT: #nread: 32
!$STAT: #nwrite: 20
35
Trace
W:0:0
N:0:1
W:0:1
N:0:2
W:1:0
N:1:1
W:1:1
N:1:2
S:1:0
N:1:0
Cmd:inode:block
10-P4: Part 1/3
Implement treedisk_setsize(0)
– currently it generates an error
– what you need to do:
  • iterate through all the blocks in the inode
  • put them on the free list
Useful functions:
• treedisk_get_snapshot
36
10-P4: Part 2/3
Implement cachedisk
– currently it doesn’t actually do anything
– what you need to do:
  • pick a caching algorithm: LRU, MFU, or design your own – go wild!
  • implement it within cachedisk.c
  • write-through cache!!
– clockdisk.c is provided
  • it implements the CLOCK algorithm
  • you can implement a refined version of CLOCK, like a two-handed clock
• consult the web for caching algorithms!
37
10-P4: Part 3/3
Implement your own trace file
– read, write, setsize operations
– at most 10,000 commands
– at most 128 inodes
– at most 1<<27 block size
– try to make it really hard for a caching layer to be effective
  • e.g., random reads/writes
38
What to submit
• treedisk.c // with treedisk_setsize()
• cachedisk.c
• trace.txt
39
The Big Red Caching Contest!!!
• We will run everybody’s trace against everybody’s treedisk and cachedisk
• We will run this on top of a statdisk
• We will count the total number of read operations
• The winner is whoever ends up doing the fewest read operations to the underlying disk
• Does not count towards the grade of 10-P4, but you may win fame and glory
40