strace log from the failure:
open("/home/travis/apprise-api/.coverage", O_RDWR|O_CREAT|O_CLOEXEC, 0644) = 31
fstat(31, {st_mode=S_IFREG|0644, st_size=192512, ...}) = 0
fstat(31, {st_mode=S_IFREG|0644, st_size=192512, ...}) = 0
lseek(31, 0, SEEK_SET) = 0
read(31, "SQLite format 3\0\4\0\1\1\0@ \0\0\0\n\0\0\0\274"..., 100) = 100
write(3, "00001299 0001 6169 execute('prag"..., 54) = 54
write(3, "Executing 'pragma journal_mode=o"..., 36) = 36
write(3, "self: <SqliteDb @0x7fa0feef9fd0 "..., 86) = 86
fcntl(31, F_SETLK, {l_type=F_RDLCK, l_whence=SEEK_SET, l_start=1073741824, l_len=1}) = 0
fcntl(31, F_SETLK, {l_type=F_RDLCK, l_whence=SEEK_SET, l_start=1073741826, l_len=510}) = 0
fcntl(31, F_SETLK, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=1073741824, l_len=1}) = 0
fstat(31, {st_mode=S_IFREG|0644, st_size=192512, ...}) = 0
fstat(31, {st_mode=S_IFREG|0644, st_size=192512, ...}) = 0
lseek(31, 0, SEEK_SET) = 0
read(31, "SQLite format 3\0\4\0\1\1\0@ \0\0\0\n\0\0\0\274"..., 1024) = 1024
lseek(31, 8192, SEEK_SET) = 8192
read(31, "\r\3-\0\4\0\316\0\3^\1X\0035\0\316\0\245\0v\0v\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024
lseek(31, 9216, SEEK_SET) = 9216
read(31, "\r\0\0\0\4\0\370\0\0\370\3d\0035\1!\0\277\0\277\2\36\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024
lseek(31, 14336, SEEK_SET) = 14336
read(31, "\r\0\0\0\4\0\320\0\3\315\1\313\1\244\0\320\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024
fcntl(31, F_SETLK, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0
write(3, "00001299 0001 6169 execute retur"..., 76) = 76
write(3, "00001299 0001 6170 execute('prag"..., 53) = 53
write(3, "Executing 'pragma synchronous=of"..., 35) = 35
lseek(31, 0, SEEK_CUR) = 15360
close(31) = 0
write(3, "self: <SqliteDb @0x7fa0feef9fd0 "..., 86) = 86
write(3, "00001299 0001 6170 execute retur"..., 76) = 76
write(3, "00001299 0001 6168 _connect retu"..., 40) = 40
write(3, "00001299 0001 6167 __enter__ ret"..., 116) = 116
write(3, "00001299 0001 6171 execute('sele"..., 82) = 82
write(3, "Executing 'select tracer from tr"..., 68) = 68
write(3, "self: <SqliteDb @0x7fa0feef9fd0 "..., 86) = 86
fcntl(31, F_SETLK, {l_type=F_RDLCK, l_whence=SEEK_SET, l_start=1073741824, l_len=1}) = -1 EBADF (Bad file descriptor)
open("/home/travis/apprise-api/.coverage", O_RDONLY|O_CLOEXEC) = 31
fstat(31, {st_mode=S_IFREG|0644, st_size=192512, ...}) = 0
ioctl(31, TCGETS, 0x7fff3bfa4920) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(31, 0, SEEK_CUR) = 0
read(31, "SQLite format 3\0\4\0\1\1\0@ \0\0\0\n\0\0\0\274"..., 4096) = 4096
close(31) = 0
write(3, "EXCEPTION from execute: disk I/O"..., 39) = 39
write(3, "self: <SqliteDb @0x7fa0feef9fd0 "..., 86) = 86
write(3, "00001299 0001 6172 __exit__(<cla"..., 208) = 208
write(3, "00001299 0001 6173 close()\n", 27) = 27
fstat(31, 0x7fff3bfa4220) = -1 EBADF (Bad file descriptor)
close(31) = -1 EBADF (Bad file descriptor)
Notice the close(31) right after the lseek. This close(31) is not generated by SQLite; rather, it's being generated by the GC finalization of an unrelated file object: #0 internal_close (self=<optimized out>) at ./Modules/_io/fileio.c:126
#1 _io_FileIO_close_impl (self=<optimized out>) at ./Modules/_io/fileio.c:171
#2 _io_FileIO_close (self=<optimized out>, _unused_ignored=<optimized out>) at ./Modules/_io/clinic/fileio.c.h:23
#3 0x00007ffff7975be7 in _PyCFunction_FastCallDict (func_obj=func_obj@entry=0x7fffeefaee58, args=args@entry=0x7fffffff7590, nargs=0, kwargs=kwargs@entry=0x0) at Objects/methodobject.c:192
#4 0x00007ffff791b89e in _PyObject_FastCallDict (func=func@entry=0x7fffeefaee58, args=args@entry=0x7fffffff7590, nargs=<optimized out>, kwargs=kwargs@entry=0x0) at Objects/abstract.c:2313
#5 0x00007ffff791c8a4 in PyObject_CallMethodObjArgs (callable=0x7fffeefaee58, name=<optimized out>) at Objects/abstract.c:2759
#6 0x00007ffff7a976b4 in buffered_close (self=0x7fffee02ae08, args=<optimized out>) at ./Modules/_io/bufferedio.c:538
#7 0x00007ffff7975be7 in _PyCFunction_FastCallDict (func_obj=func_obj@entry=0x7fffedf7eab0, args=args@entry=0x7fffffff77c0, nargs=0, kwargs=kwargs@entry=0x0) at Objects/methodobject.c:192
#8 0x00007ffff791b89e in _PyObject_FastCallDict (func=func@entry=0x7fffedf7eab0, args=args@entry=0x7fffffff77c0, nargs=<optimized out>, kwargs=kwargs@entry=0x0) at Objects/abstract.c:2313
#9 0x00007ffff791c8a4 in PyObject_CallMethodObjArgs (callable=0x7fffedf7eab0, callable@entry=0x7fffee02ae08, name=<optimized out>) at Objects/abstract.c:2759
#10 0x00007ffff7a90def in iobase_finalize (self=0x7fffee02ae08) at ./Modules/_io/iobase.c:266
#11 0x00007ffff7a59e4f in finalize_garbage (collectable=0x7fffffff7950) at Modules/gcmodule.c:806
#12 collect (generation=generation@entry=1, n_collected=n_collected@entry=0x7fffffff79f0, n_uncollectable=n_uncollectable@entry=0x7fffffff79f8, nofail=nofail@entry=0) at Modules/gcmodule.c:1005
#13 0x00007ffff7a5a63b in collect_with_callback (generation=1) at Modules/gcmodule.c:1128
#14 0x00007ffff7a5ae8b in collect_generations () at Modules/gcmodule.c:1151
#15 _PyObject_GC_Alloc (basicsize=<optimized out>, use_calloc=0) at Modules/gcmodule.c:1729
#16 _PyObject_GC_Malloc (basicsize=<optimized out>) at Modules/gcmodule.c:1739
#17 0x00007ffff7a5afdd in _PyObject_GC_New (tp=tp@entry=0x7ffff7d53ea0 <PyGen_Type>) at Modules/gcmodule.c:1751
#18 0x00007ffff794766f in gen_new_with_qualname (qualname=0x7ffff4c1b450, name=0x7ffff4c218f0, f=0x7ffff292daf8, type=0x7ffff7d53ea0 <PyGen_Type>) at Objects/genobject.c:802
#19 PyGen_NewWithQualName (f=0x7ffff292daf8, name=0x7ffff4c218f0, qualname=0x7ffff4c1b450) at Objects/genobject.c:830
#20 0x00007ffff7a07f31 in _PyEval_EvalCodeWithName (_co=0x7ffff4c23270, globals=globals@entry=0x7ffff4cdd9d8, locals=locals@entry=0x0, args=<optimized out>, argcount=1, kwnames=kwnames@entry=0x0, kwargs=0x7ffff4c32930,
kwcount=0, kwstep=1, defs=0x0, defcount=0, kwdefs=0x0, closure=0x7fffede2cfd0, name=0x7ffff4c218f0, qualname=0x7ffff4c1b450) at Python/ceval.c:4150
#21 0x00007ffff7a081d7 in fast_function (kwnames=0x0, nargs=<optimized out>, stack=<optimized out>, func=0x7fffedc03f28) at Python/ceval.c:4978
#22 call_function (pp_stack=pp_stack@entry=0x7fffffff7c60, oparg=oparg@entry=1, kwnames=kwnames@entry=0x0) at Python/ceval.c:4858
#23 0x00007ffff7a0a633 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at Python/ceval.c:3335
Note that normal SQLite3 closes look like this: #0 0x00007ffff769b560 in __close_nocancel () from /lib/x86_64-linux-gnu/libpthread.so.0
#1 0x00007ffff3a23d8c in robust_close (lineno=29378, h=<optimized out>, pFile=0x113cfd8) at sqlite3.c:28631
#2 closeUnixFile (id=id@entry=0x113cfd8) at sqlite3.c:29378
#3 0x00007ffff3a24f43 in unixClose (id=0x113cfd8) at sqlite3.c:29427
#4 0x00007ffff3a4858b in sqlite3OsClose (pId=0x113cfd8) at sqlite3.c:17944
#5 sqlite3PagerClose (pPager=0x113ce68) at sqlite3.c:47961
#6 0x00007ffff3a5930b in sqlite3BtreeClose (p=0xfb8088) at sqlite3.c:58266
#7 0x00007ffff3a59501 in sqlite3LeaveMutexAndCloseZombie (db=0x1260598) at sqlite3.c:134102
#8 0x00007ffff3a59ae2 in sqlite3Close (db=0x1260598, forceZombie=0) at sqlite3.c:134045
#9 0x00007ffff3cc8fe2 in pysqlite_connection_close (self=0x7fffedf5fd50, args=<optimized out>) at /tmp/python-build.20181021062245.3423/Python-3.6.7/Modules/_sqlite/connection.c:337
#10 0x00007ffff7975be7 in _PyCFunction_FastCallDict (func_obj=0x7fffedf7eab0, args=0x7fffede2e890, nargs=0, kwargs=kwargs@entry=0x0) at Objects/methodobject.c:192
#11 0x00007ffff7975eb7 in _PyCFunction_FastCallKeywords (func=func@entry=0x7fffedf7eab0, stack=stack@entry=0x7fffede2e890, nargs=<optimized out>, kwnames=kwnames@entry=0x0) at Objects/methodobject.c:294
#12 0x00007ffff7a082c1 in call_function (pp_stack=pp_stack@entry=0x7fffffff9640, oparg=oparg@entry=0, kwnames=kwnames@entry=0x0) at Python/ceval.c:4837
EDIT 1: OK, I found the line of code that allocates this file, at line 398 in apprise_api/api/views.py: with NamedTemporaryFile() as f:
# Write our content to disk
f.write(config.encode())
f.flush()
If you disable this block of code everything runs fine (although the tests fail). It's kinda creepy that the NamedTemporaryFile is leaking - it really shouldn't...The test harness uses mock to cause NamedTemporaryFile to fail in a few places:
with patch('tempfile._TemporaryFileWrapper') as mock_ntf:
mock_ntf.side_effect = OSError()
Unfortunately tempfile.NamedTemporaryFile is not properly set up to detect failures from tempfile._TemporaryFileWrapper: try:
file = _io.open(fd, mode, buffering=buffering,
newline=newline, encoding=encoding)
return _TemporaryFileWrapper(file, name, delete)
except BaseException:
_os.unlink(name)
_os.close(fd)
raise
Notice how `file` itself isn't cleaned up in the event that _TemporaryFileWrapper throws; instead, the underlying `fd` is closed directly. If `file` was actually opened correctly, it will now dangle, and some time later `file` will get GCed, attempt to close its fd, and fireworks ensue.I don't know who's fault this ultimately is, but it does illustrate a possible danger of mock testing when messing with the internals of other libraries.
Both pieces of code are broken. "except BaseException" is wrong because it'll catch KeyboardInterrupt and such, after the file object exists. The mock is wrong because it's introducing an exception that the original code can never raise and was written to assume wouldn't be raised, thus turning a corner case rare bug (fd reuse when a user ^Cs a program) into a bigger problem.
The tempfile code should change to only catch Exceptions, not BaseExceptions; it may opt to additionally catch all BaseExceptions and only unlink the name, but that's probably not sufficient anyway since a KeyboardInterrupt may be raised before the whole try block. Really, the unlink should happen in a wider scope try block, and the _os.close should be more tightly constrained to the _io.open call. And the exception matching should be narrower, because it's safer to leak an fd than to accidentally close it twice.
Then either the test harness should find some other way to implement that, or they can ask the tempfile people to move the _TemporaryFileWrapper out of the try block so they can keep using that hook point, if it hasn't been already by that point.
> I don't know who's fault this ultimately is, but it does illustrate a possible danger of mock testing when messing with the internals of other libraries.
It doesn't look like there's any way that _TemporaryFileWrapper() could throw an OSError. I'd consider that mock a bug.
macOS and probably iOS has an undocumented set of filesystem calls which require a "cookie." You call open() and provide a large integer of your choice (probably random); to close the file you need to provide the same integer. This fixed a lot of crashes in CoreData, where some rando was closing its SQLite fd.
I want to learn more. Is there anything I can read about this? (I tried searching but I just keep on getting stuff about HTTP cookies not this.)
This is what the gdb script looks like:
set height 0
catch syscall close
catch syscall read
catch syscall lseek
disable 1 2
commands 2
disable 1 2
continue
end
commands 3
if $rdi == 31
enable 1 2
continue
else
continue
end
end
The lseek catchpoint (3) enables both read and close catchpoints; if the read catchpoint (2) is hit first it disables both and continues. This way we look for lseek followed by close without intervening reads.It generates a few false positives but otherwise fairly quickly stops on the right syscall, at which point I could backtrace and prod the live program.
We recently upgraded to celery 4 which had a side effect of file descriptors leaking between master and child processes, and database fds were being used for redis. Still no idea how it's happening, but the linked PEP seems to suggest there may be issues that I don't quite understand.
My guess is the tempfile is leaking due to the attribute caching here https://github.com/python/cpython/blob/master/Lib/tempfile.p... . But that doesn't really explain why the file is getting closed twice. (Presuming it's getting closed twice as otherwise the file descriptor wouldn't be reallocated)
Edit: realised it's just attaching the closer object to the function not the full object so avoiding this.
My gut instinct would be to rewrite the section OP found as a standard NamedTemporaryFile instantiation, then close it manually. The __getattr__ hack feels like a real code smell.
The second step was to figure out which FileIO object was being GCed. I ran the program again and broke inside the FileIO __init__ function (which is written in C), conditioned on the fd being 31. From that breakpoint I did a Python backtrace (using the tools from https://github.com/python/cpython/blob/master/Misc/gdbinit) to figure out the Python code that was responsible for the allocation.
The third step was futzing around with the NamedTemporaryFile code until I could determine why it was leaking a file. This took a little while because I did not initially realize that it had been mocked. Once I did, the solution presented itself.
https://forums.unraid.net/bug-reports/prereleases/sqlite-dat...
From that report:
====== 8< ======
> In the Linux block layer each READ or WRITE can have various modifier bits set. In the case of a read-ahead you get READ|REQ_RAHEAD which tells I/O driver this is a read-ahead. In this case, if there are insufficient resources at the time this request is received, the driver is permitted to terminate the operation with BLK_STS_IOERR status. Here is an example in Linux md/raid5 driver.
> In case of Unraid it can definitely happen under heavy load that a read-ahead comes along and there are no 'stripe buffers' immediately available. In this case, instead of making calling process wait, it terminated the I/O. This has worked this way for years.
[...]
> What I suspect is that this is a bug in SQLite - I think SQLite is using direct-I/O (bypassing page cache) and issuing it's own read-aheads and their logic to handle failing read-ahead is broken. But I did not follow that rabbit hole - too many other problems to work on :/
====== 8< ======
Some related bugs they brought up:
https://bugzilla.kernel.org/show_bug.cgi?id=201685
https://patchwork.kernel.org/patch/10712695/
Edit: it appears that there's _something_ causing corruption when drivers fail read-ahead I/O. Whether it's SQLite or something else in Linux is another question.
Should be pretty easy to confirm/deny that suspicion and see if it's worth going down the rabbit hole, no?
strace -e open -o >(grep O_DIRECT) ./myprogramBug #915: Please help tells me nothing.
I refer to them as "help-me zombies". They have no brains and they want to eat yours. They don't know what they need help with, but they need help.
(Edit: Original title has been changed, so this comment is no longer necessary.)
I don't mean it to suggest that Ned has a lessor role than creator, just that the post's title was changed to add inaccurate information. I figured that if the moderators saw this and decided that the context of the author's role in Coverage was important, they could update the title to reflect his actual role.
[1] https://github.com/nedbat/coveragepy/blob/master/CONTRIBUTOR...
self.execute("pragma journal_mode=off").close()
# This pragma makes writing faster.
self.execute("pragma synchronous=off").close()
I've always used WAL journaling mode, and NORMAL synchronous mode.Even if removing those pragmas fixed things, it feels likely that would just be hiding the issue instead of fixing it.
Disk io errors are not supposed to happen in sqlite except in cases like: accessing a corrupted database, drives failing or running out of disk space, bad permissions on some folder sqlite needs to write to.
So unless the test case involves a python process crashing or getting killed, I question if this is actually a bug in your your code.
I feel like this being some bug in either sqlite, or in the file systems used by docker (specifically by behaving in some manner not anticipated by sqlite).
With those optimizations, the only remaining sqlite disk IO should be the database itself, and the temp store. Since I stronly suspect the error you are getting is coming from sqlite, it would be helpful to know which of those two disk io cases it is.
Perhaps you can try the pragma to force the temp store to be in memory? The results of that might help us know if the disk io error is in reading/writing the database file itself, or if it is in reading/writing temp files.
Knowing that could narrow things down a bit.
Is there any possibility e.g. of an FD reuse issue in the code somewhere? It might even be possible to spot it with strace (although you need to start the container with the right caps to try that). An example where this can happen is programs closing 'stderr' fd 2, only for e.g. SQLite to open its database reusing fd 2, then some library code calls fprintf(stderr, ..) etc.
Would love a copy of the DB 'sometime before' and after to look at. There are tools around for inspecting SQLite databases on a page-by-page basis
https://bitbucket.org/ned/coveragepy/src/default/ original project https://nedbatchelder.com/blog/201405/github_monoculture.htm... original post https://news.ycombinator.com/item?id=7690897 HN comments
From https://github.com/nedbat/coveragepy/commit/e4b8389b27ce4e76...
"Put the filename calc back in _connect. Fixes #916
It was moved to __init__ to avoid recalculating, but the directory could have changed, so we need to wait to do the work. Instead, only do the relpath on systems that need it (Windows Py 2)."
https://bugs.python.org/issue16379
There an uncommitted patch to get the extended error in this other, related issue: https://bugs.python.org/issue24139
Or maybe just try cffi or ctypes in python to call sqlite3_errcode(), sqlite3_extended_errcode() and sqlite3_errmsg(). The incantation to find an already linked sqlite3 lib:
import ctypes
import ctypes.util
sqlite = ctypes.CDLL(ctypes.util.find_library('sqlite3'))
> HAVE NO TOOLS BECAUSE I’VE DESTROYED MY TOOLS WITH MY TOOLS
https://www.usenix.org/system/files/1311_05-08_mickens.pdf )
"Put the filename calc back in _connect. Fixes #916
It was moved to __init__ to avoid recalculating, but the directory could have changed, so we need to wait to do the work. Instead, only do the relpath on systems that need it (Windows Py 2)."