Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * buffile.c
4 : * Management of large buffered files, primarily temporary files.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/buffile.c
11 : *
12 : * NOTES:
13 : *
14 : * BufFiles provide a very incomplete emulation of stdio atop virtual Files
15 : * (as managed by fd.c). Currently, we only support the buffered-I/O
16 : * aspect of stdio: a read or write of the low-level File occurs only
17 : * when the buffer is filled or emptied. This is an even bigger win
18 : * for virtual Files than for ordinary kernel files, since reducing the
19 : * frequency with which a virtual File is touched reduces "thrashing"
20 : * of opening/closing file descriptors.
21 : *
22 : * Note that BufFile structs are allocated with palloc(), and therefore
23 : * will go away automatically at transaction end. If the underlying
24 : * virtual File is made with OpenTemporaryFile, then all resources for
25 : * the file are certain to be cleaned up even if processing is aborted
26 : * by ereport(ERROR). The data structures required are made in the
27 : * palloc context that was current when the BufFile was created, and
28 : * any external resources such as temp files are owned by the ResourceOwner
29 : * that was current at that time.
30 : *
31 : * BufFile also supports temporary files that exceed the OS file size limit
32 : * (by opening multiple fd.c temporary files). This is an essential feature
33 : * for sorts and hashjoins on large amounts of data.
34 : *-------------------------------------------------------------------------
35 : */
36 :
37 : #include "postgres.h"
38 :
39 : #include "executor/instrument.h"
40 : #include "pgstat.h"
41 : #include "storage/fd.h"
42 : #include "storage/buffile.h"
43 : #include "storage/buf_internals.h"
44 : #include "utils/resowner.h"
45 :
/*
 * Segment sizing for BufFiles.
 *
 * A BufFile is split into physical component files of at most
 * MAX_PHYSICAL_FILESIZE bytes (0x40000000 = 1 GB) each.  This limit is
 * deliberately independent of RELSEG_SIZE: keeping segments at a gigabyte
 * lets a large temporary BufFile be spread over several tablespaces when
 * more than one is available.
 *
 * BUFFILE_SEG_SIZE is the same limit expressed in BLCKSZ-sized blocks.
 */
#define MAX_PHYSICAL_FILESIZE	0x40000000
#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
53 :
54 : /*
55 : * This data structure represents a buffered file that consists of one or
56 : * more physical files (each accessed through a virtual file descriptor
57 : * managed by fd.c).
58 : */
59 : struct BufFile
60 : {
61 : int numFiles; /* number of physical files in set */
62 : /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
63 : File *files; /* palloc'd array with numFiles entries */
64 : off_t *offsets; /* palloc'd array with numFiles entries */
65 :
66 : /*
67 : * offsets[i] is the current seek position of files[i]. We use this to
68 : * avoid making redundant FileSeek calls.
69 : */
70 :
71 : bool isTemp; /* can only add files if this is TRUE */
72 : bool isInterXact; /* keep open over transactions? */
73 : bool dirty; /* does buffer need to be written? */
74 :
75 : /*
76 : * resowner is the ResourceOwner to use for underlying temp files. (We
77 : * don't need to remember the memory context we're using explicitly,
78 : * because after creation we only repalloc our arrays larger.)
79 : */
80 : ResourceOwner resowner;
81 :
82 : /*
83 : * "current pos" is position of start of buffer within the logical file.
84 : * Position as seen by user of BufFile is (curFile, curOffset + pos).
85 : */
86 : int curFile; /* file index (0..n) part of current pos */
87 : off_t curOffset; /* offset part of current pos */
88 : int pos; /* next read/write position in buffer */
89 : int nbytes; /* total # of valid bytes in buffer */
90 : char buffer[BLCKSZ];
91 : };
92 :
93 : static BufFile *makeBufFile(File firstfile);
94 : static void extendBufFile(BufFile *file);
95 : static void BufFileLoadBuffer(BufFile *file);
96 : static void BufFileDumpBuffer(BufFile *file);
97 : static int BufFileFlush(BufFile *file);
98 :
99 :
100 : /*
101 : * Create a BufFile given the first underlying physical file.
102 : * NOTE: caller must set isTemp and isInterXact if appropriate.
103 : */
104 : static BufFile *
105 23 : makeBufFile(File firstfile)
106 : {
107 23 : BufFile *file = (BufFile *) palloc(sizeof(BufFile));
108 :
109 23 : file->numFiles = 1;
110 23 : file->files = (File *) palloc(sizeof(File));
111 23 : file->files[0] = firstfile;
112 23 : file->offsets = (off_t *) palloc(sizeof(off_t));
113 23 : file->offsets[0] = 0L;
114 23 : file->isTemp = false;
115 23 : file->isInterXact = false;
116 23 : file->dirty = false;
117 23 : file->resowner = CurrentResourceOwner;
118 23 : file->curFile = 0;
119 23 : file->curOffset = 0L;
120 23 : file->pos = 0;
121 23 : file->nbytes = 0;
122 :
123 23 : return file;
124 : }
125 :
126 : /*
127 : * Add another component temp file.
128 : */
129 : static void
130 0 : extendBufFile(BufFile *file)
131 : {
132 : File pfile;
133 : ResourceOwner oldowner;
134 :
135 : /* Be sure to associate the file with the BufFile's resource owner */
136 0 : oldowner = CurrentResourceOwner;
137 0 : CurrentResourceOwner = file->resowner;
138 :
139 0 : Assert(file->isTemp);
140 0 : pfile = OpenTemporaryFile(file->isInterXact);
141 0 : Assert(pfile >= 0);
142 :
143 0 : CurrentResourceOwner = oldowner;
144 :
145 0 : file->files = (File *) repalloc(file->files,
146 0 : (file->numFiles + 1) * sizeof(File));
147 0 : file->offsets = (off_t *) repalloc(file->offsets,
148 0 : (file->numFiles + 1) * sizeof(off_t));
149 0 : file->files[file->numFiles] = pfile;
150 0 : file->offsets[file->numFiles] = 0L;
151 0 : file->numFiles++;
152 0 : }
153 :
154 : /*
155 : * Create a BufFile for a new temporary file (which will expand to become
156 : * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
157 : * written to it).
158 : *
159 : * If interXact is true, the temp file will not be automatically deleted
160 : * at end of transaction.
161 : *
162 : * Note: if interXact is true, the caller had better be calling us in a
163 : * memory context, and with a resource owner, that will survive across
164 : * transaction boundaries.
165 : */
166 : BufFile *
167 23 : BufFileCreateTemp(bool interXact)
168 : {
169 : BufFile *file;
170 : File pfile;
171 :
172 23 : pfile = OpenTemporaryFile(interXact);
173 23 : Assert(pfile >= 0);
174 :
175 23 : file = makeBufFile(pfile);
176 23 : file->isTemp = true;
177 23 : file->isInterXact = interXact;
178 :
179 23 : return file;
180 : }
181 :
182 : #ifdef NOT_USED
183 : /*
184 : * Create a BufFile and attach it to an already-opened virtual File.
185 : *
186 : * This is comparable to fdopen() in stdio. This is the only way at present
187 : * to attach a BufFile to a non-temporary file. Note that BufFiles created
188 : * in this way CANNOT be expanded into multiple files.
189 : */
190 : BufFile *
191 : BufFileCreate(File file)
192 : {
193 : return makeBufFile(file);
194 : }
195 : #endif
196 :
197 : /*
198 : * Close a BufFile
199 : *
200 : * Like fclose(), this also implicitly FileCloses the underlying File.
201 : */
202 : void
203 23 : BufFileClose(BufFile *file)
204 : {
205 : int i;
206 :
207 : /* flush any unwritten data */
208 23 : BufFileFlush(file);
209 : /* close the underlying file(s) (with delete if it's a temp file) */
210 46 : for (i = 0; i < file->numFiles; i++)
211 23 : FileClose(file->files[i]);
212 : /* release the buffer space */
213 23 : pfree(file->files);
214 23 : pfree(file->offsets);
215 23 : pfree(file);
216 23 : }
217 :
218 : /*
219 : * BufFileLoadBuffer
220 : *
221 : * Load some data into buffer, if possible, starting from curOffset.
222 : * At call, must have dirty = false, pos and nbytes = 0.
223 : * On exit, nbytes is number of bytes loaded.
224 : */
225 : static void
226 1039 : BufFileLoadBuffer(BufFile *file)
227 : {
228 : File thisfile;
229 :
230 : /*
231 : * Advance to next component file if necessary and possible.
232 : *
233 : * This path can only be taken if there is more than one component, so it
234 : * won't interfere with reading a non-temp file that is over
235 : * MAX_PHYSICAL_FILESIZE.
236 : */
237 1039 : if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
238 0 : file->curFile + 1 < file->numFiles)
239 : {
240 0 : file->curFile++;
241 0 : file->curOffset = 0L;
242 : }
243 :
244 : /*
245 : * May need to reposition physical file.
246 : */
247 1039 : thisfile = file->files[file->curFile];
248 1039 : if (file->curOffset != file->offsets[file->curFile])
249 : {
250 44 : if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
251 1039 : return; /* seek failed, read nothing */
252 44 : file->offsets[file->curFile] = file->curOffset;
253 : }
254 :
255 : /*
256 : * Read whatever we can get, up to a full bufferload.
257 : */
258 1039 : file->nbytes = FileRead(thisfile,
259 1039 : file->buffer,
260 : sizeof(file->buffer),
261 : WAIT_EVENT_BUFFILE_READ);
262 1039 : if (file->nbytes < 0)
263 0 : file->nbytes = 0;
264 1039 : file->offsets[file->curFile] += file->nbytes;
265 : /* we choose not to advance curOffset here */
266 :
267 1039 : pgBufferUsage.temp_blks_read++;
268 : }
269 :
270 : /*
271 : * BufFileDumpBuffer
272 : *
273 : * Dump buffer contents starting at curOffset.
274 : * At call, should have dirty = true, nbytes > 0.
275 : * On exit, dirty is cleared if successful write, and curOffset is advanced.
276 : */
277 : static void
278 1057 : BufFileDumpBuffer(BufFile *file)
279 : {
280 1057 : int wpos = 0;
281 : int bytestowrite;
282 : File thisfile;
283 :
284 : /*
285 : * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
286 : * crosses a component-file boundary; so we need a loop.
287 : */
288 3171 : while (wpos < file->nbytes)
289 : {
290 : /*
291 : * Advance to next component file if necessary and possible.
292 : */
293 1057 : if (file->curOffset >= MAX_PHYSICAL_FILESIZE && file->isTemp)
294 : {
295 0 : while (file->curFile + 1 >= file->numFiles)
296 0 : extendBufFile(file);
297 0 : file->curFile++;
298 0 : file->curOffset = 0L;
299 : }
300 :
301 : /*
302 : * Enforce per-file size limit only for temp files, else just try to
303 : * write as much as asked...
304 : */
305 1057 : bytestowrite = file->nbytes - wpos;
306 1057 : if (file->isTemp)
307 : {
308 1057 : off_t availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
309 :
310 1057 : if ((off_t) bytestowrite > availbytes)
311 0 : bytestowrite = (int) availbytes;
312 : }
313 :
314 : /*
315 : * May need to reposition physical file.
316 : */
317 1057 : thisfile = file->files[file->curFile];
318 1057 : if (file->curOffset != file->offsets[file->curFile])
319 : {
320 6 : if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
321 0 : return; /* seek failed, give up */
322 6 : file->offsets[file->curFile] = file->curOffset;
323 : }
324 1057 : bytestowrite = FileWrite(thisfile,
325 1057 : file->buffer + wpos,
326 : bytestowrite,
327 : WAIT_EVENT_BUFFILE_WRITE);
328 1057 : if (bytestowrite <= 0)
329 0 : return; /* failed to write */
330 1057 : file->offsets[file->curFile] += bytestowrite;
331 1057 : file->curOffset += bytestowrite;
332 1057 : wpos += bytestowrite;
333 :
334 1057 : pgBufferUsage.temp_blks_written++;
335 : }
336 1057 : file->dirty = false;
337 :
338 : /*
339 : * At this point, curOffset has been advanced to the end of the buffer,
340 : * ie, its original value + nbytes. We need to make it point to the
341 : * logical file position, ie, original value + pos, in case that is less
342 : * (as could happen due to a small backwards seek in a dirty buffer!)
343 : */
344 1057 : file->curOffset -= (file->nbytes - file->pos);
345 1057 : if (file->curOffset < 0) /* handle possible segment crossing */
346 : {
347 0 : file->curFile--;
348 0 : Assert(file->curFile >= 0);
349 0 : file->curOffset += MAX_PHYSICAL_FILESIZE;
350 : }
351 :
352 : /*
353 : * Now we can set the buffer empty without changing the logical position
354 : */
355 1057 : file->pos = 0;
356 1057 : file->nbytes = 0;
357 : }
358 :
359 : /*
360 : * BufFileRead
361 : *
362 : * Like fread() except we assume 1-byte element size.
363 : */
364 : size_t
365 362167 : BufFileRead(BufFile *file, void *ptr, size_t size)
366 : {
367 362167 : size_t nread = 0;
368 : size_t nthistime;
369 :
370 362167 : if (file->dirty)
371 : {
372 0 : if (BufFileFlush(file) != 0)
373 0 : return 0; /* could not flush... */
374 0 : Assert(!file->dirty);
375 : }
376 :
377 1086713 : while (size > 0)
378 : {
379 362399 : if (file->pos >= file->nbytes)
380 : {
381 : /* Try to load more data into buffer. */
382 1039 : file->curOffset += file->pos;
383 1039 : file->pos = 0;
384 1039 : file->nbytes = 0;
385 1039 : BufFileLoadBuffer(file);
386 1039 : if (file->nbytes <= 0)
387 20 : break; /* no more data available */
388 : }
389 :
390 362379 : nthistime = file->nbytes - file->pos;
391 362379 : if (nthistime > size)
392 361361 : nthistime = size;
393 362379 : Assert(nthistime > 0);
394 :
395 362379 : memcpy(ptr, file->buffer + file->pos, nthistime);
396 :
397 362379 : file->pos += nthistime;
398 362379 : ptr = (void *) ((char *) ptr + nthistime);
399 362379 : size -= nthistime;
400 362379 : nread += nthistime;
401 : }
402 :
403 362167 : return nread;
404 : }
405 :
406 : /*
407 : * BufFileWrite
408 : *
409 : * Like fwrite() except we assume 1-byte element size.
410 : */
411 : size_t
412 382150 : BufFileWrite(BufFile *file, void *ptr, size_t size)
413 : {
414 382150 : size_t nwritten = 0;
415 : size_t nthistime;
416 :
417 1146703 : while (size > 0)
418 : {
419 382403 : if (file->pos >= BLCKSZ)
420 : {
421 : /* Buffer full, dump it out */
422 1028 : if (file->dirty)
423 : {
424 1028 : BufFileDumpBuffer(file);
425 1028 : if (file->dirty)
426 0 : break; /* I/O error */
427 : }
428 : else
429 : {
430 : /* Hmm, went directly from reading to writing? */
431 0 : file->curOffset += file->pos;
432 0 : file->pos = 0;
433 0 : file->nbytes = 0;
434 : }
435 : }
436 :
437 382403 : nthistime = BLCKSZ - file->pos;
438 382403 : if (nthistime > size)
439 381367 : nthistime = size;
440 382403 : Assert(nthistime > 0);
441 :
442 382403 : memcpy(file->buffer + file->pos, ptr, nthistime);
443 :
444 382403 : file->dirty = true;
445 382403 : file->pos += nthistime;
446 382403 : if (file->nbytes < file->pos)
447 382403 : file->nbytes = file->pos;
448 382403 : ptr = (void *) ((char *) ptr + nthistime);
449 382403 : size -= nthistime;
450 382403 : nwritten += nthistime;
451 : }
452 :
453 382150 : return nwritten;
454 : }
455 :
456 : /*
457 : * BufFileFlush
458 : *
459 : * Like fflush()
460 : */
461 : static int
462 73 : BufFileFlush(BufFile *file)
463 : {
464 73 : if (file->dirty)
465 : {
466 29 : BufFileDumpBuffer(file);
467 29 : if (file->dirty)
468 0 : return EOF;
469 : }
470 :
471 73 : return 0;
472 : }
473 :
474 : /*
475 : * BufFileSeek
476 : *
477 : * Like fseek(), except that target position needs two values in order to
478 : * work when logical filesize exceeds maximum value representable by long.
479 : * We do not support relative seeks across more than LONG_MAX, however.
480 : *
481 : * Result is 0 if OK, EOF if not. Logical position is not moved if an
482 : * impossible seek is attempted.
483 : */
484 : int
485 1392 : BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
486 : {
487 : int newFile;
488 : off_t newOffset;
489 :
490 1392 : switch (whence)
491 : {
492 : case SEEK_SET:
493 1392 : if (fileno < 0)
494 0 : return EOF;
495 1392 : newFile = fileno;
496 1392 : newOffset = offset;
497 1392 : break;
498 : case SEEK_CUR:
499 :
500 : /*
501 : * Relative seek considers only the signed offset, ignoring
502 : * fileno. Note that large offsets (> 1 gig) risk overflow in this
503 : * add, unless we have 64-bit off_t.
504 : */
505 0 : newFile = file->curFile;
506 0 : newOffset = (file->curOffset + file->pos) + offset;
507 0 : break;
508 : #ifdef NOT_USED
509 : case SEEK_END:
510 : /* could be implemented, not needed currently */
511 : break;
512 : #endif
513 : default:
514 0 : elog(ERROR, "invalid whence: %d", whence);
515 : return EOF;
516 : }
517 2784 : while (newOffset < 0)
518 : {
519 0 : if (--newFile < 0)
520 0 : return EOF;
521 0 : newOffset += MAX_PHYSICAL_FILESIZE;
522 : }
523 2784 : if (newFile == file->curFile &&
524 2749 : newOffset >= file->curOffset &&
525 1357 : newOffset <= file->curOffset + file->nbytes)
526 : {
527 : /*
528 : * Seek is to a point within existing buffer; we can just adjust
529 : * pos-within-buffer, without flushing buffer. Note this is OK
530 : * whether reading or writing, but buffer remains dirty if we were
531 : * writing.
532 : */
533 1342 : file->pos = (int) (newOffset - file->curOffset);
534 1342 : return 0;
535 : }
536 : /* Otherwise, must reposition buffer, so flush any dirty data */
537 50 : if (BufFileFlush(file) != 0)
538 0 : return EOF;
539 :
540 : /*
541 : * At this point and no sooner, check for seek past last segment. The
542 : * above flush could have created a new segment, so checking sooner would
543 : * not work (at least not with this code).
544 : */
545 50 : if (file->isTemp)
546 : {
547 : /* convert seek to "start of next seg" to "end of last seg" */
548 50 : if (newFile == file->numFiles && newOffset == 0)
549 : {
550 0 : newFile--;
551 0 : newOffset = MAX_PHYSICAL_FILESIZE;
552 : }
553 100 : while (newOffset > MAX_PHYSICAL_FILESIZE)
554 : {
555 0 : if (++newFile >= file->numFiles)
556 0 : return EOF;
557 0 : newOffset -= MAX_PHYSICAL_FILESIZE;
558 : }
559 : }
560 50 : if (newFile >= file->numFiles)
561 0 : return EOF;
562 : /* Seek is OK! */
563 50 : file->curFile = newFile;
564 50 : file->curOffset = newOffset;
565 50 : file->pos = 0;
566 50 : file->nbytes = 0;
567 50 : return 0;
568 : }
569 :
570 : void
571 14 : BufFileTell(BufFile *file, int *fileno, off_t *offset)
572 : {
573 14 : *fileno = file->curFile;
574 14 : *offset = file->curOffset + file->pos;
575 14 : }
576 :
577 : /*
578 : * BufFileSeekBlock --- block-oriented seek
579 : *
580 : * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
581 : * the file. Note that users of this interface will fail if their files
582 : * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
583 : * with tables bigger than that, either...
584 : *
585 : * Result is 0 if OK, EOF if not. Logical position is not moved if an
586 : * impossible seek is attempted.
587 : */
588 : int
589 1371 : BufFileSeekBlock(BufFile *file, long blknum)
590 : {
591 2742 : return BufFileSeek(file,
592 1371 : (int) (blknum / BUFFILE_SEG_SIZE),
593 1371 : (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
594 : SEEK_SET);
595 : }
596 :
597 : #ifdef NOT_USED
598 : /*
599 : * BufFileTellBlock --- block-oriented tell
600 : *
601 : * Any fractional part of a block in the current seek position is ignored.
602 : */
603 : long
604 : BufFileTellBlock(BufFile *file)
605 : {
606 : long blknum;
607 :
608 : blknum = (file->curOffset + file->pos) / BLCKSZ;
609 : blknum += file->curFile * BUFFILE_SEG_SIZE;
610 : return blknum;
611 : }
612 :
613 : #endif
|