001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.kahadb.page;
018
019 import java.io.ByteArrayInputStream;
020 import java.io.ByteArrayOutputStream;
021 import java.io.DataInputStream;
022 import java.io.DataOutputStream;
023 import java.io.File;
024 import java.io.FileInputStream;
025 import java.io.FileOutputStream;
026 import java.io.IOException;
027 import java.io.InterruptedIOException;
028 import java.io.RandomAccessFile;
029 import java.util.ArrayList;
030 import java.util.Arrays;
031 import java.util.Collection;
032 import java.util.Iterator;
033 import java.util.LinkedHashMap;
034 import java.util.Map;
035 import java.util.Properties;
036 import java.util.TreeMap;
037 import java.util.Map.Entry;
038 import java.util.concurrent.CountDownLatch;
039 import java.util.concurrent.atomic.AtomicBoolean;
040 import java.util.concurrent.atomic.AtomicLong;
041 import java.util.zip.Adler32;
042 import java.util.zip.Checksum;
043
044 import org.apache.commons.logging.Log;
045 import org.apache.commons.logging.LogFactory;
046 import org.apache.kahadb.util.DataByteArrayOutputStream;
047 import org.apache.kahadb.util.IOHelper;
048 import org.apache.kahadb.util.IntrospectionSupport;
049 import org.apache.kahadb.util.LRUCache;
050 import org.apache.kahadb.util.Sequence;
051 import org.apache.kahadb.util.SequenceSet;
052
053 /**
054 * A PageFile provides you random access to fixed sized disk pages. This object is not thread safe and therefore access to it should
055 * be externally synchronized.
056 *
057 * The file has 3 parts:
058 * Metadata Space: 4k : Reserved metadata area. Used to store persistent config about the file.
059 * Recovery Buffer Space: Page Size * 1000 : This is a redo log used to prevent partial page writes from making the file inconsistent
060 * Page Space: The pages in the page file.
061 *
062 * @version $Revision: 814649 $
063 */
064 public class PageFile {
065
// Suffixes of the three files a PageFile keeps on disk.
private static final String PAGEFILE_SUFFIX = ".data";
private static final String RECOVERY_FILE_SUFFIX = ".redo";
private static final String FREE_FILE_SUFFIX = ".free";

// 4k Default page size.
public static final int DEFAULT_PAGE_SIZE = Integer.parseInt(System.getProperty("defaultPageSize", ""+1024*4));
public static final int DEFAULT_WRITE_BATCH_SIZE = Integer.parseInt(System.getProperty("defaultWriteBatchSize", ""+1000));
// Fixed header reserved at the start of the recovery file.
private static final int RECOVERY_FILE_HEADER_SIZE=1024*4;
// Fixed header reserved at the start of the main page file; holds two copies of the metadata.
private static final int PAGE_FILE_HEADER_SIZE=1024*4;

// Recovery file header is written by writeBatch(): (long txid, long checksum, int pageCount)
private static final Log LOG = LogFactory.getLog(PageFile.class);

// A PageFile will use a couple of files in this directory
private File directory;
// And the file names in that directory will be based on this name.
private final String name;

// File handle used for reading pages..
private RandomAccessFile readFile;
// File handle used for writing pages..
private RandomAccessFile writeFile;
// File handle used for the redo log (recovery buffer)..
private RandomAccessFile recoveryFile;

// The size of pages
private int pageSize = DEFAULT_PAGE_SIZE;

// The minimum amount of space allocated to the recovery file, in number of pages.
private int recoveryFileMinPageCount = 1000;
// The max size that we let the recovery file grow to.. it may exceed the max, but the file will get resized
// to this max size as soon as possible.
private int recoveryFileMaxPageCount = 10000;
// The number of pages in the current recovery buffer
private int recoveryPageCount;

// True between load() and unload().
private AtomicBoolean loaded = new AtomicBoolean();
// The number of pages we are aiming to write every time we
// write to disk.
int writeBatchSize = DEFAULT_WRITE_BATCH_SIZE;

// The cache of recently used pages.
private LRUCache<Long, Page> pageCache;
// Is the read page cache enabled?
private boolean enablePageCaching=true;
// How many pages will we keep in the cache?
private int pageCacheSize = 100;

// Should first log the page write to the recovery buffer? Avoids partial
// page write failures..
private boolean enableRecoveryFile=true;
// Will we sync writes to disk. Ensures that data will not be lost after a checkpoint()
private boolean enableDiskSyncs=true;
// Will writes be done in an async thread?
private boolean enabledWriteThread=false;

// These are used if enabledWriteThread==true
private AtomicBoolean stopWriter = new AtomicBoolean();
private Thread writerThread;
// Latch used by flush() to wait until the writer thread has drained the write cache.
private CountDownLatch checkpointLatch;

// Keeps track of writes that are being written to disk.
private TreeMap<Long, PageWrite> writes=new TreeMap<Long, PageWrite>();

// Id of the next page allocated at the end of the file.
private final AtomicLong nextFreePageId = new AtomicLong();
// Keeps track of free pages.
private SequenceSet freeList = new SequenceSet();

// The next write transaction id that will be handed out.
private AtomicLong nextTxid = new AtomicLong();

// Persistent settings stored in the page file.
private MetaData metaData;
138
139 /**
140 * Use to keep track of updated pages which have not yet been committed.
141 */
142 static class PageWrite {
143 Page page;
144 byte[] current;
145 byte[] diskBound;
146
147 public PageWrite(Page page, byte[] data) {
148 this.page=page;
149 current=data;
150 }
151
152 public void setCurrent(Page page, byte[] data) {
153 this.page=page;
154 current=data;
155 }
156
157 @Override
158 public String toString() {
159 return "[PageWrite:"+page.getPageId()+"]";
160 }
161
162 @SuppressWarnings("unchecked")
163 public Page getPage() {
164 return page;
165 }
166
167 void begin() {
168 diskBound = current;
169 current = null;
170 }
171
172 /**
173 * @return true if there is no pending writes to do.
174 */
175 boolean done() {
176 diskBound=null;
177 return current == null;
178 }
179
180 }
181
182 /**
183 * The MetaData object hold the persistent data associated with a PageFile object.
184 */
185 public static class MetaData {
186
187 String fileType;
188 String fileTypeVersion;
189
190 long metaDataTxId=-1;
191 int pageSize;
192 boolean cleanShutdown;
193 long lastTxId;
194 long freePages;
195
196 public String getFileType() {
197 return fileType;
198 }
199 public void setFileType(String fileType) {
200 this.fileType = fileType;
201 }
202 public String getFileTypeVersion() {
203 return fileTypeVersion;
204 }
205 public void setFileTypeVersion(String version) {
206 this.fileTypeVersion = version;
207 }
208 public long getMetaDataTxId() {
209 return metaDataTxId;
210 }
211 public void setMetaDataTxId(long metaDataTxId) {
212 this.metaDataTxId = metaDataTxId;
213 }
214 public int getPageSize() {
215 return pageSize;
216 }
217 public void setPageSize(int pageSize) {
218 this.pageSize = pageSize;
219 }
220 public boolean isCleanShutdown() {
221 return cleanShutdown;
222 }
223 public void setCleanShutdown(boolean cleanShutdown) {
224 this.cleanShutdown = cleanShutdown;
225 }
226 public long getLastTxId() {
227 return lastTxId;
228 }
229 public void setLastTxId(long lastTxId) {
230 this.lastTxId = lastTxId;
231 }
232 public long getFreePages() {
233 return freePages;
234 }
235 public void setFreePages(long value) {
236 this.freePages = value;
237 }
238 }
239
/**
 * Starts a new transaction against this page file.
 *
 * @throws IllegalStateException if the page file is not loaded.
 */
public Transaction tx() {
    assertLoaded();
    return new Transaction(this);
}

/**
 * Creates a PageFile in the specified directory whose data files are named by name.
 *
 * @param directory the directory holding the page file's data files
 * @param name base name used to derive the data/redo/free file names
 */
public PageFile(File directory, String name) {
    this.directory = directory;
    this.name = name;
}

/**
 * Deletes the files used by the PageFile object. This method can only be used when this object is not loaded.
 *
 * @throws IOException if the files cannot be deleted.
 * @throws IllegalStateException if this PageFile is loaded
 */
public void delete() throws IOException {
    if (loaded.get()) {
        throw new IllegalStateException("Cannot delete page file data when the page file is loaded");
    }
    File[] files = { getMainPageFile(), getFreeFile(), getRecoveryFile() };
    for (File f : files) {
        delete(f);
    }
}

// Deletes a single file if it exists, failing loudly when the delete does not take.
private void delete(File file) throws IOException {
    if (!file.exists()) {
        return;
    }
    if (!file.delete()) {
        throw new IOException("Could not delete: "+file.getPath());
    }
}
284
/**
 * Loads the page file so that it can be accessed for read/write purposes. This allocates OS resources. If this is the
 * first time the page file is loaded, then this creates the page file in the file system.
 *
 * @throws IOException
 *         If the page file cannot be loaded. This could be because the existing page file is corrupt, is a bad version, or
 *         there was a disk error.
 * @throws IllegalStateException
 *         If the page file was already loaded.
 */
public void load() throws IOException, IllegalStateException {
    if (loaded.compareAndSet(false, true)) {

        if( enablePageCaching ) {
            pageCache = new LRUCache<Long, Page>(pageCacheSize, pageCacheSize, 0.75f, true);
        }

        File file = getMainPageFile();
        IOHelper.mkdirs(file.getParentFile());
        writeFile = new RandomAccessFile(file, "rw");
        readFile = new RandomAccessFile(file, "r");

        if (readFile.length() > 0) {
            // Load the page size setting cause that can't change once the file is created.
            loadMetaData();
            pageSize = metaData.getPageSize();
        } else {
            // First load ever: store the page size setting cause that can't change once the file is created.
            metaData = new MetaData();
            metaData.setFileType(PageFile.class.getName());
            metaData.setFileTypeVersion("1");
            metaData.setPageSize(getPageSize());
            metaData.setCleanShutdown(true);
            metaData.setFreePages(-1);
            metaData.setLastTxId(0);
            storeMetaData();
        }

        if( enableRecoveryFile ) {
            recoveryFile = new RandomAccessFile(getRecoveryFile(), "rw");
        }

        if( metaData.isCleanShutdown() ) {
            // Clean shutdown: the persisted tx id and free list can be trusted.
            nextTxid.set(metaData.getLastTxId()+1);
            if( metaData.getFreePages()>0 ) {
                loadFreeList();
            }
        } else {
            // Dirty shutdown: replay the recovery buffer, then rebuild the free
            // list by scanning every page in the file.
            LOG.debug("Recovering page file...");
            nextTxid.set(redoRecoveryUpdates());

            // Scan all to find the free pages.
            freeList = new SequenceSet();
            for (Iterator i = tx().iterator(true); i.hasNext();) {
                Page page = (Page)i.next();
                if( page.getType() == Page.PAGE_FREE_TYPE ) {
                    freeList.add(page.getPageId());
                }
            }

        }

        // Mark the file dirty while it is open; unload() marks it clean again.
        metaData.setCleanShutdown(false);
        storeMetaData();
        getFreeFile().delete();

        if( writeFile.length() < PAGE_FILE_HEADER_SIZE) {
            writeFile.setLength(PAGE_FILE_HEADER_SIZE);
        }
        // Derive the next page id from the file length past the header.
        nextFreePageId.set((writeFile.length()-PAGE_FILE_HEADER_SIZE)/pageSize);
        startWriter();

    } else {
        throw new IllegalStateException("Cannot load the page file when it is allready loaded.");
    }
}
361
362
/**
 * Unloads a previously loaded PageFile. This deallocates OS related resources like file handles.
 * once unloaded, you can no longer use the page file to read or write Pages.
 *
 * @throws IOException
 *         if a disk error occurred while closing down the page file.
 * @throws IllegalStateException
 *         if the PageFile is not loaded
 */
public void unload() throws IOException {
    if (loaded.compareAndSet(true, false)) {
        // Push pending writes to disk and stop the async writer first so no
        // write races with the teardown below.
        flush();
        try {
            stopWriter();
        } catch (InterruptedException e) {
            throw new InterruptedIOException();
        }

        // Persist the free list so the next load() can skip a full page scan.
        if( freeList.isEmpty() ) {
            metaData.setFreePages(0);
        } else {
            storeFreeList();
            metaData.setFreePages(freeList.size());
        }

        // Record a clean shutdown so load() can trust the persisted state.
        metaData.setLastTxId( nextTxid.get()-1 );
        metaData.setCleanShutdown(true);
        storeMetaData();

        if (readFile != null) {
            readFile.close();
            readFile = null;
            writeFile.close();
            writeFile=null;
            if( enableRecoveryFile ) {
                recoveryFile.close();
                recoveryFile=null;
            }
            freeList.clear();
            if( pageCache!=null ) {
                pageCache=null;
            }
            synchronized(writes) {
                writes.clear();
            }
        }
    } else {
        throw new IllegalStateException("Cannot unload the page file when it is not loaded");
    }
}

// @return true if the page file is currently loaded.
public boolean isLoaded() {
    return loaded.get();
}
417
/**
 * Flush and sync all write buffers to disk.
 *
 * @throws IOException
 *         If a disk error occurred.
 */
public void flush() throws IOException {

    if( enabledWriteThread && stopWriter.get() ) {
        throw new IOException("Page file already stopped: checkpointing is not allowed");
    }

    // Setup a latch that gets notified when all buffered writes hits the disk.
    CountDownLatch checkpointLatch;
    synchronized( writes ) {
        if( writes.isEmpty()) {
            return;
        }
        if( enabledWriteThread ) {
            // Async mode: share one latch among all concurrent flushers, wake
            // the writer thread, then wait outside the lock.
            if( this.checkpointLatch == null ) {
                this.checkpointLatch = new CountDownLatch(1);
            }
            checkpointLatch = this.checkpointLatch;
            writes.notify();
        } else {
            // Sync mode: write the batch out on the caller's thread.
            writeBatch();
            return;
        }
    }
    try {
        int size = writes.size();
        long start = System.currentTimeMillis();
        checkpointLatch.await();
        long end = System.currentTimeMillis();
        // Warn when the writer thread is falling badly behind the flushers.
        if( end-start > 100 ) {
            LOG.warn("KahaDB PageFile flush: " + size + " queued writes, latch wait took "+(end-start));
        }
    } catch (InterruptedException e) {
        throw new InterruptedIOException();
    }
}


public String toString() {
    return "Page File: "+getMainPageFile();
}
464
465 ///////////////////////////////////////////////////////////////////
466 // Private Implementation Methods
467 ///////////////////////////////////////////////////////////////////
468 private File getMainPageFile() {
469 return new File(directory, IOHelper.toFileSystemSafeName(name)+PAGEFILE_SUFFIX);
470 }
471
472 public File getFreeFile() {
473 return new File(directory, IOHelper.toFileSystemSafeName(name)+FREE_FILE_SUFFIX);
474 }
475
476 public File getRecoveryFile() {
477 return new File(directory, IOHelper.toFileSystemSafeName(name)+RECOVERY_FILE_SUFFIX);
478 }
479
480 private long toOffset(long pageId) {
481 return PAGE_FILE_HEADER_SIZE+(pageId*pageSize);
482 }
483
484 private void loadMetaData() throws IOException {
485
486 ByteArrayInputStream is;
487 MetaData v1 = new MetaData();
488 MetaData v2 = new MetaData();
489 try {
490 Properties p = new Properties();
491 byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
492 readFile.seek(0);
493 readFile.readFully(d);
494 is = new ByteArrayInputStream(d);
495 p.load(is);
496 IntrospectionSupport.setProperties(v1, p);
497 } catch (IOException e) {
498 v1 = null;
499 }
500
501 try {
502 Properties p = new Properties();
503 byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
504 readFile.seek(PAGE_FILE_HEADER_SIZE/2);
505 readFile.readFully(d);
506 is = new ByteArrayInputStream(d);
507 p.load(is);
508 IntrospectionSupport.setProperties(v2, p);
509 } catch (IOException e) {
510 v2 = null;
511 }
512
513 if( v1==null && v2==null ) {
514 throw new IOException("Could not load page file meta data");
515 }
516
517 if( v1 == null || v1.metaDataTxId<0 ) {
518 metaData = v2;
519 } else if( v2==null || v1.metaDataTxId<0 ) {
520 metaData = v1;
521 } else if( v1.metaDataTxId==v2.metaDataTxId ) {
522 metaData = v1; // use the first since the 2nd could be a partial..
523 } else {
524 metaData = v2; // use the second cause the first is probably a partial.
525 }
526 }
527
528 private void storeMetaData() throws IOException {
529 // Convert the metadata into a property format
530 metaData.metaDataTxId++;
531 Properties p = new Properties();
532 IntrospectionSupport.getProperties(metaData, p, null);
533
534 ByteArrayOutputStream os = new ByteArrayOutputStream(PAGE_FILE_HEADER_SIZE);
535 p.store(os, "");
536 if( os.size() > PAGE_FILE_HEADER_SIZE/2) {
537 throw new IOException("Configuation is to larger than: "+PAGE_FILE_HEADER_SIZE/2);
538 }
539 // Fill the rest with space...
540 byte[] filler = new byte[(PAGE_FILE_HEADER_SIZE/2)-os.size()];
541 Arrays.fill(filler, (byte)' ');
542 os.write(filler);
543 os.flush();
544
545 byte[] d = os.toByteArray();
546
547 // So we don't loose it.. write it 2 times...
548 writeFile.seek(0);
549 writeFile.write(d);
550 writeFile.getFD().sync();
551 writeFile.seek(PAGE_FILE_HEADER_SIZE/2);
552 writeFile.write(d);
553 writeFile.getFD().sync();
554 }
555
556 private void storeFreeList() throws IOException {
557 FileOutputStream os = new FileOutputStream(getFreeFile());
558 DataOutputStream dos = new DataOutputStream(os);
559 SequenceSet.Marshaller.INSTANCE.writePayload(freeList, dos);
560 dos.close();
561 }
562
563 private void loadFreeList() throws IOException {
564 freeList.clear();
565 FileInputStream is = new FileInputStream(getFreeFile());
566 DataInputStream dis = new DataInputStream(is);
567 freeList = SequenceSet.Marshaller.INSTANCE.readPayload(dis);
568 dis.close();
569 }
570
571 ///////////////////////////////////////////////////////////////////
572 // Property Accessors
573 ///////////////////////////////////////////////////////////////////
574
/**
 * Is the recovery buffer used to double buffer page writes. Enabled by default.
 *
 * @return is the recovery buffer enabled.
 */
public boolean isEnableRecoveryFile() {
    return enableRecoveryFile;
}

/**
 * Sets if the recovery buffer is used to double buffer page writes. Enabled by default. Disabling this
 * may potentially cause partial page writes which can lead to page file corruption.
 *
 * @param doubleBuffer true to stage each write batch in the redo file first
 */
public void setEnableRecoveryFile(boolean doubleBuffer) {
    assertNotLoaded();
    this.enableRecoveryFile = doubleBuffer;
}

/**
 * @return Are page writes synced to disk?
 */
public boolean isEnableDiskSyncs() {
    return enableDiskSyncs;
}

/**
 * Allows you to enable syncing writes to disk.
 *
 * @param syncWrites true to fsync after each write batch
 */
public void setEnableDiskSyncs(boolean syncWrites) {
    assertNotLoaded();
    this.enableDiskSyncs = syncWrites;
}

/**
 * @return the page size
 */
public int getPageSize() {
    return this.pageSize;
}

/**
 * @return the amount of content data that a page can hold.
 */
public int getPageContentSize() {
    return this.pageSize-Page.PAGE_HEADER_SIZE;
}

/**
 * Configures the page size used by the page file. By default it is 4k. Once a page file is created on disk,
 * subsequent loads of that file will use the original pageSize. Once the PageFile is loaded, this setting
 * can no longer be changed.
 *
 * @param pageSize the pageSize to set
 * @throws IllegalStateException
 *         once the page file is loaded.
 */
public void setPageSize(int pageSize) throws IllegalStateException {
    assertNotLoaded();
    this.pageSize = pageSize;
}

/**
 * @return true if read page caching is enabled
 */
public boolean isEnablePageCaching() {
    return this.enablePageCaching;
}

/**
 * @param enablePageCaching allows you to enable read page caching
 */
public void setEnablePageCaching(boolean enablePageCaching) {
    assertNotLoaded();
    this.enablePageCaching = enablePageCaching;
}

/**
 * @return the maximum number of pages that will get stored in the read page cache.
 */
public int getPageCacheSize() {
    return this.pageCacheSize;
}

/**
 * @param pageCacheSize sets the maximum number of pages that will get stored in the read page cache.
 */
public void setPageCacheSize(int pageCacheSize) {
    assertNotLoaded();
    this.pageCacheSize = pageCacheSize;
}

// True when page writes are flushed by a dedicated background thread.
public boolean isEnabledWriteThread() {
    return enabledWriteThread;
}

// Enables/disables the background writer thread; must be set before load().
public void setEnableWriteThread(boolean enableAsyncWrites) {
    assertNotLoaded();
    this.enabledWriteThread = enableAsyncWrites;
}

// Size in bytes of the allocated page space, including the file header.
public long getDiskSize() throws IOException {
    return toOffset(nextFreePageId.get());
}

/**
 * @return the number of pages allocated in the PageFile
 */
public long getPageCount() {
    return nextFreePageId.get();
}

// Minimum capacity of the recovery buffer, in pages.
public int getRecoveryFileMinPageCount() {
    return recoveryFileMinPageCount;
}

public void setRecoveryFileMinPageCount(int recoveryFileMinPageCount) {
    assertNotLoaded();
    this.recoveryFileMinPageCount = recoveryFileMinPageCount;
}

// Maximum capacity of the recovery buffer, in pages; writeBatch() shrinks the
// file back toward this limit when it is exceeded.
public int getRecoveryFileMaxPageCount() {
    return recoveryFileMaxPageCount;
}

public void setRecoveryFileMaxPageCount(int recoveryFileMaxPageCount) {
    assertNotLoaded();
    this.recoveryFileMaxPageCount = recoveryFileMaxPageCount;
}

// Target number of queued page writes per disk batch.
public int getWriteBatchSize() {
    return writeBatchSize;
}

public void setWriteBatchSize(int writeBatchSize) {
    assertNotLoaded();
    this.writeBatchSize = writeBatchSize;
}
713
714 ///////////////////////////////////////////////////////////////////
715 // Package Protected Methods exposed to Transaction
716 ///////////////////////////////////////////////////////////////////
717
718 /**
719 * @throws IllegalStateException if the page file is not loaded.
720 */
721 void assertLoaded() throws IllegalStateException {
722 if( !loaded.get() ) {
723 throw new IllegalStateException("PageFile is not loaded");
724 }
725 }
726 void assertNotLoaded() throws IllegalStateException {
727 if( loaded.get() ) {
728 throw new IllegalStateException("PageFile is loaded");
729 }
730 }
731
732 /**
733 * Allocates a block of free pages that you can write data to.
734 *
735 * @param count the number of sequential pages to allocate
736 * @return the first page of the sequential set.
737 * @throws IOException
738 * If an disk error occurred.
739 * @throws IllegalStateException
740 * if the PageFile is not loaded
741 */
742 <T> Page<T> allocate(int count) throws IOException {
743 assertLoaded();
744 if (count <= 0) {
745 throw new IllegalArgumentException("The allocation count must be larger than zero");
746 }
747
748 Sequence seq = freeList.removeFirstSequence(count);
749
750 // We may need to create new free pages...
751 if (seq == null) {
752
753 Page<T> first = null;
754 int c = count;
755 while (c > 0) {
756 Page<T> page = new Page<T>(nextFreePageId.getAndIncrement());
757 page.makeFree(getNextWriteTransactionId());
758
759 if (first == null) {
760 first = page;
761 }
762
763 addToCache(page);
764 DataByteArrayOutputStream out = new DataByteArrayOutputStream(pageSize);
765 page.write(out);
766 write(page, out.getData());
767
768 // LOG.debug("allocate writing: "+page.getPageId());
769 c--;
770 }
771
772 return first;
773 }
774
775 Page<T> page = new Page<T>(seq.getFirst());
776 page.makeFree(0);
777 // LOG.debug("allocated: "+page.getPageId());
778 return page;
779 }
780
// Hands out the next write transaction id (monotonically increasing).
long getNextWriteTransactionId() {
    return nextTxid.incrementAndGet();
}

// Reads the raw bytes of the given page from the main page file into data.
void readPage(long pageId, byte[] data) throws IOException {
    readFile.seek(toOffset(pageId));
    readFile.readFully(data);
}

// Returns a page to the free list for re-allocation and drops any stale
// cached copy of it.
public void freePage(long pageId) {
    freeList.add(pageId);
    if( enablePageCaching ) {
        pageCache.remove(pageId);
    }
}
796
// Queues a single page write by wrapping it in the Map.Entry collection form
// expected by the batch write(Collection) method.
@SuppressWarnings("unchecked")
private <T> void write(Page<T> page, byte[] data) throws IOException {
    final PageWrite write = new PageWrite(page, data);
    Entry<Long, PageWrite> entry = new Entry<Long, PageWrite>(){
        public Long getKey() {
            return write.getPage().getPageId();
        }
        public PageWrite getValue() {
            return write;
        }
        public PageWrite setValue(PageWrite value) {
            // Read-only adapter; never invoked.
            return null;
        }
    };
    Entry<Long, PageWrite>[] entries = new Map.Entry[]{entry};
    write(Arrays.asList(entries));
}
814
// Queues a batch of page updates into the write cache. In async mode this
// applies back pressure when the cache is full; the disk write itself happens
// on the writer thread (or inline, in sync mode).
void write(Collection<Map.Entry<Long, PageWrite>> updates) throws IOException {
    synchronized( writes ) {
        if( enabledWriteThread ) {
            // Block until the writer thread drains the cache below the batch size.
            while( writes.size() >= writeBatchSize && !stopWriter.get() ) {
                try {
                    writes.wait();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    throw new InterruptedIOException();
                }
            }
        }

        for (Map.Entry<Long, PageWrite> entry : updates) {
            Long key = entry.getKey();
            PageWrite value = entry.getValue();
            PageWrite write = writes.get(key);
            if( write==null ) {
                writes.put(key, value);
            } else {
                // Page already queued: just replace its pending data in place.
                write.setCurrent(value.page, value.current);
            }
        }

        // Once we start approaching capacity, notify the writer to start writing
        if( canStartWriteBatch() ) {
            if( enabledWriteThread ) {
                writes.notify();
            } else {
                writeBatch();
            }
        }
    }
}
849
850 private boolean canStartWriteBatch() {
851 int capacityUsed = ((writes.size() * 100)/writeBatchSize);
852 if( enabledWriteThread ) {
853 // The constant 10 here controls how soon write batches start going to disk..
854 // would be nice to figure out how to auto tune that value. Make to small and
855 // we reduce through put because we are locking the write mutex too often doing writes
856 return capacityUsed >= 10 || checkpointLatch!=null;
857 } else {
858 return capacityUsed >= 80 || checkpointLatch!=null;
859 }
860 }
861
862 ///////////////////////////////////////////////////////////////////
863 // Cache Related operations
864 ///////////////////////////////////////////////////////////////////
865 @SuppressWarnings("unchecked")
866 <T> Page<T> getFromCache(long pageId) {
867 synchronized(writes) {
868 PageWrite pageWrite = writes.get(pageId);
869 if( pageWrite != null ) {
870 return pageWrite.page;
871 }
872 }
873
874 Page<T> result = null;
875 if (enablePageCaching) {
876 result = pageCache.get(pageId);
877 }
878 return result;
879 }
880
881 void addToCache(Page page) {
882 if (enablePageCaching) {
883 pageCache.put(page.getPageId(), page);
884 }
885 }
886
887 void removeFromCache(Page page) {
888 if (enablePageCaching) {
889 pageCache.remove(page.getPageId());
890 }
891 }
892
893 ///////////////////////////////////////////////////////////////////
894 // Internal Double write implementation follows...
895 ///////////////////////////////////////////////////////////////////
896 /**
897 *
898 */
899 private void pollWrites() {
900 try {
901 while( !stopWriter.get() ) {
902 // Wait for a notification...
903 synchronized( writes ) {
904 writes.notifyAll();
905
906 // If there is not enough to write, wait for a notification...
907 while( writes.isEmpty() && checkpointLatch==null && !stopWriter.get() ) {
908 writes.wait(100);
909 }
910
911 if( writes.isEmpty() ) {
912 releaseCheckpointWaiter();
913 }
914 }
915 writeBatch();
916 }
917 } catch (Throwable e) {
918 e.printStackTrace();
919 } finally {
920 releaseCheckpointWaiter();
921 }
922 }
923
/**
 * Flushes the current write cache to disk as one batch: first staged in the
 * recovery buffer (when enabled) with a checksum, then applied to the main
 * page file, and finally any checkpoint waiters are released.
 *
 * @throws IOException if a disk error occurred.
 */
private void writeBatch() throws IOException {

    CountDownLatch checkpointLatch;
    ArrayList<PageWrite> batch;
    synchronized( writes ) {
        // If there is not enough to write, wait for a notification...

        batch = new ArrayList<PageWrite>(writes.size());
        // build a write batch from the current write cache.
        for (PageWrite write : writes.values()) {
            batch.add(write);
            // Move the current write to the diskBound write, this lets folks update the
            // page again without blocking for this write.
            write.begin();
        }

        // Grab on to the existing checkpoint latch cause once we do this write we can
        // release the folks that were waiting for those writes to hit disk.
        checkpointLatch = this.checkpointLatch;
        this.checkpointLatch=null;
    }


    if (enableRecoveryFile) {

        // Using Adler-32 instead of CRC-32 because it's much faster and its
        // weakness for short messages of a few hundred bytes is not a factor in this case since we know
        // our write batches are going to be much larger.
        Checksum checksum = new Adler32();
        for (PageWrite w : batch) {
            checksum.update(w.diskBound, 0, pageSize);
        }

        // Can we shrink the recovery buffer??
        if( recoveryPageCount > recoveryFileMaxPageCount ) {
            int t = Math.max(recoveryFileMinPageCount, batch.size());
            recoveryFile.setLength(recoveryFileSizeForPages(t));
        }

        // Record the page writes in the recovery buffer.
        recoveryFile.seek(0);
        // Store the next tx id...
        recoveryFile.writeLong(nextTxid.get());
        // Store the checksum for the write batch so that on recovery we know if we have a consistent
        // write batch on disk.
        recoveryFile.writeLong(checksum.getValue());
        // Write the # of pages that will follow
        recoveryFile.writeInt(batch.size());


        // Write the pages: each record is (long pageId, page data).
        recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
        for (PageWrite w : batch) {
            recoveryFile.writeLong(w.page.getPageId());
            recoveryFile.write(w.diskBound, 0, pageSize);
        }

        if (enableDiskSyncs) {
            // Sync to make sure recovery buffer writes land on disk..
            recoveryFile.getFD().sync();
        }

        recoveryPageCount = batch.size();
    }


    // Apply the batch to the main page file.
    for (PageWrite w : batch) {
        writeFile.seek(toOffset(w.page.getPageId()));
        writeFile.write(w.diskBound, 0, pageSize);
    }

    // Sync so the page updates are durable before we drop them from the write
    // cache and release checkpoint waiters.
    if( enableDiskSyncs ) {
        writeFile.getFD().sync();
    }

    synchronized( writes ) {
        for (PageWrite w : batch) {
            // If there are no more pending writes, then remove it from the write cache.
            if( w.done() ) {
                writes.remove(w.page.getPageId());
            }
        }
    }

    if( checkpointLatch!=null ) {
        checkpointLatch.countDown();
    }
}
1021
1022 private long recoveryFileSizeForPages(int pageCount) {
1023 return RECOVERY_FILE_HEADER_SIZE+((pageSize+8)*pageCount);
1024 }
1025
1026 private void releaseCheckpointWaiter() {
1027 if( checkpointLatch!=null ) {
1028 checkpointLatch.countDown();
1029 checkpointLatch=null;
1030 }
1031 }
1032
1033 /**
1034 * Inspects the recovery buffer and re-applies any
1035 * partially applied page writes.
1036 *
1037 * @return the next transaction id that can be used.
1038 * @throws IOException
1039 */
1040 private long redoRecoveryUpdates() throws IOException {
1041 if( !enableRecoveryFile ) {
1042 return 0;
1043 }
1044 recoveryPageCount=0;
1045
1046 // Are we initializing the recovery file?
1047 if( recoveryFile.length() == 0 ) {
1048 // Write an empty header..
1049 recoveryFile.write(new byte[RECOVERY_FILE_HEADER_SIZE]);
1050 // Preallocate the minium size for better performance.
1051 recoveryFile.setLength(recoveryFileSizeForPages(recoveryFileMinPageCount));
1052 return 0;
1053 }
1054
1055 // How many recovery pages do we have in the recovery buffer?
1056 recoveryFile.seek(0);
1057 long nextTxId = readFile.readLong();
1058 long expectedChecksum = readFile.readLong();
1059 int pageCounter = readFile.readInt();
1060
1061 recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
1062 Checksum checksum = new Adler32();
1063 LinkedHashMap<Long, byte[]> batch = new LinkedHashMap<Long, byte[]>();
1064 try {
1065 for (int i = 0; i < pageCounter; i++) {
1066 long offset = recoveryFile.readLong();
1067 byte []data = new byte[pageSize];
1068 if( recoveryFile.read(data, 0, pageSize) != pageSize ) {
1069 // Invalid recovery record, Could not fully read the data". Probably due to a partial write to the recovery buffer
1070 return nextTxId;
1071 }
1072 checksum.update(data, 0, pageSize);
1073 batch.put(offset, data);
1074 }
1075 } catch (Exception e) {
1076 // If an error occurred it was cause the redo buffer was not full written out correctly.. so don't redo it.
1077 // as the pages should still be consistent.
1078 LOG.debug("Redo buffer was not fully intact: ", e);
1079 return nextTxId;
1080 }
1081
1082 recoveryPageCount = pageCounter;
1083
1084 // If the checksum is not valid then the recovery buffer was partially written to disk.
1085 if( checksum.getValue() != expectedChecksum ) {
1086 return nextTxId;
1087 }
1088
1089 // Re-apply all the writes in the recovery buffer.
1090 for (Map.Entry<Long, byte[]> e : batch.entrySet()) {
1091 writeFile.seek(e.getKey());
1092 e.getValue();
1093 writeFile.write(e.getValue());
1094 }
1095
1096 // And sync it to disk
1097 writeFile.getFD().sync();
1098 return nextTxId;
1099 }
1100
// Starts the background writer thread, if async writes are enabled.
// Synchronizes on the write cache so startup cannot race with queued writes.
private void startWriter() {
    synchronized( writes ) {
        if( enabledWriteThread ) {
            stopWriter.set(false);
            writerThread = new Thread("KahaDB Page Writer") {
                @Override
                public void run() {
                    pollWrites();
                }
            };
            writerThread.setPriority(Thread.MAX_PRIORITY);
            writerThread.setDaemon(true);
            writerThread.start();
        }
    }
}

// Signals the writer thread to stop and waits for it to exit.
private void stopWriter() throws InterruptedException {
    if( enabledWriteThread ) {
        stopWriter.set(true);
        writerThread.join();
    }
}

// @return the main data (.data) file backing this PageFile.
public File getFile() {
    return getMainPageFile();
}
1128
1129 }