pprice@lsst-dev01:~/LSST/ctrl/pool[master] $ cat ~/test_dm-9541.py
|
#!/usr/bin/env python
|
|
from lsst.ctrl.pool.pool import Debugger, Pool, startPool, NODE
|
|
# Switch on the pool's debug output (the "Master: ..." / "Slave N: ..." lines
# seen in the run logs appear to come from this -- TODO confirm).
Debugger().enabled = True

# Length of the string each job returns: 2**30 + 2**29 = 1610612736 characters.
SIZE = 2**30 + 2**29

# Number of jobs handed to the pool.
NUM = 10
|
|
def func(index):
    """Report which node is handling ``index`` and return a large payload.

    Prints a "Processing <index> on <NODE>" line, then returns a string of
    ``SIZE`` copies of the character "X". The payload is deliberately large
    so that the result sent back through the pool is large as well.
    """
    # The single-argument parenthesised form prints the same text as the
    # Python 2 print statement.
    print("Processing %d on %s" % (index, NODE))
    payload = "X" * SIZE
    return payload
|
|
def main():
    """Run ``func`` over ``NUM`` indices with a ``Pool`` and report the results.

    Prints the number of results followed by the list of each result's
    length, then shuts the pool down with ``pool.exit()``.
    """
    pool = Pool(None)
    results = pool.map(func, list(range(NUM)))

    lengths = [len(rr) for rr in results]
    # One formatted string gives the same "<count> <list>" line as printing
    # the two items separated by a comma.
    print("%d %s" % (len(results), lengths))

    pool.exit()
|
|
|
if __name__ == "__main__":
    # startPool() must run before main(); presumably it sets up the MPI
    # master/slave roles used by Pool -- verify against lsst.ctrl.pool.pool.
    startPool()
    main()
|
|
pprice@lsst-dev01:~/LSST/ctrl/pool[tickets/DM-9541] $ srun -N 2 --ntasks-per-node=2 -I --pty bash
|
pprice@lsst-verify-worker04:~/LSST/ctrl/pool[tickets/DM-9541] $ mpiexec python ~/test_dm-9541.py
|
Master: command reduce
|
Slave 3: waiting for command from 0
|
Slave 1: waiting for command from 0
|
Master: instruct
|
Slave 1: command reduce
|
Slave 2: waiting for command from 0
|
Slave 1: waiting for instruction
|
Slave 3: command reduce
|
Slave 2: command reduceSlave 1: waiting for job
|
|
Slave 3: waiting for instruction
|
Slave 2: waiting for instruction
|
Master: scatter initial jobs
|
Slave 3: waiting for job
|
Slave 2: waiting for job
|
Slave 2: running job
|
Processing 1 on lsst-verify-worker05:56007
|
Processing 0 on lsst-verify-worker04:72783Slave 1: running job
|
|
Slave 3: running job
|
Processing 2 on lsst-verify-worker05:56008
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
Master: send job to slave 3 2
|
Processing 3 on lsst-verify-worker05:56007
|
Slave 2: running job
|
Slave 1: waiting for job
|
Master: gather from slave 1
|
Master: send job to slave 4 1
|
Processing 4 on lsst-verify-worker04:72783
|
Slave 1: running job
|
Slave 3: waiting for job
|
Master: gather from slave 3
|
Master: send job to slave 5 3
|
Slave 3: running job
|
Processing 5 on lsst-verify-worker05:56008
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
Master: send job to slave 6 2
|
Processing 6 on lsst-verify-worker05:56007
|
Slave 2: running job
|
Slave 1: waiting for job
|
Master: gather from slave 1
|
Master: send job to slave 7 1
|
Processing 7 on lsst-verify-worker04:72783
|
Slave 1: running job
|
Slave 3: waiting for job
|
Master: gather from slave 3
|
Master: send job to slave 8 3
|
Slave 3: running job
|
Processing 8 on lsst-verify-worker05:56008
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
Master: send job to slave 9 2
|
Processing 9 on lsst-verify-worker05:56007
|
Slave 2: running job
|
Slave 1: waiting for job
|
Master: gather from slave 1
|
Slave 1: done
|
Slave 1: waiting for command from 0
|
Slave 3: waiting for job
|
Master: gather from slave 3
|
Slave 3: done
|
Slave 3: waiting for command from 0
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
Master: done
|
10 [1610612736, 1610612736, 1610612736, 1610612736, 1610612736, 1610612736, 1610612736, 1610612736, 1610612736, 1610612736]
|
Master: command exit
|
Slave 1: command exit
|
Slave 1: exiting
|
Slave 2: done
|
Slave 2: waiting for command from 0
|
Slave 2: command exit
|
Slave 2: exiting
|
Slave 3: command exit
|
Slave 3: exiting
|
pprice@lsst-verify-worker04:~/LSST/ctrl/pool[tickets/DM-9541] $ git co master
|
Switched to branch 'master'
|
Your branch is up-to-date with 'origin/master'.
|
pprice@lsst-verify-worker04:~/LSST/ctrl/pool[master] $ mpiexec python ~/test_dm-9541.py
|
Slave 2: waiting for command from 0
|
Slave 1: waiting for command from 0
|
Slave 3: waiting for command from 0
|
Master: command reduce
|
Master: instruct
|
Slave 2: command reduce
|
Slave 1: command reduce
|
Slave 1: waiting for instruction
|
Slave 3: command reduce
|
Slave 1: waiting for job
|
Slave 2: waiting for instruction
|
Master: scatter initial jobsSlave 3: waiting for instruction
|
|
Slave 2: waiting for job
|
Slave 3: waiting for job
|
Processing 0 on lsst-verify-worker04:72948
|
Processing 1 on lsst-verify-worker05:56118
|
Slave 1: running job
|
Slave 2: running job
|
Processing 2 on lsst-verify-worker05:56119
|
Slave 3: running job
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
Master: send job to slave 3 2
|
Slave 1: waiting for job
|
Processing 3 on lsst-verify-worker05:56118
|
Slave 2: running job
|
Master: gather from slave 1
|
Master: send job to slave 4 1
|
Slave 1: running job
|
Processing 4 on lsst-verify-worker04:72948
|
Slave 3: waiting for job
|
Master: gather from slave 3
|
Master: send job to slave 5 3
|
Slave 3: running job
|
Processing 5 on lsst-verify-worker05:56119
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
Master: send job to slave 6 2
|
Processing 6 on lsst-verify-worker05:56118
|
Slave 2: running job
|
Slave 1: waiting for job
|
Slave 3: waiting for job
|
Master: gather from slave 3
|
Master: send job to slave 7 3
|
Master: gather from slave 1
|
Master: send job to slave 8 1
|
Processing 8 on lsst-verify-worker04:72948
|
Slave 1: running job
|
Processing 7 on lsst-verify-worker05:56119
|
Slave 3: running job
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
Master: send job to slave 9 2
|
Processing 9 on lsst-verify-worker05:56118
|
Slave 2: running job
|
Slave 3: waiting for job
|
Slave 1: waiting for job
|
Master: gather from slave 3
|
Master: gather from slave 1
|
Slave 2: waiting for job
|
Master: gather from slave 2
|
OverflowError on lsst-verify-worker04:72948 in run: integer 4831838248 does not fit in 'int'
|
Traceback (most recent call last):
|
File "/home/pprice/LSST/ctrl/pool/python/lsst/ctrl/pool/pool.py", line 113, in wrapper
|
return func(*args, **kwargs)
|
File "/home/pprice/LSST/ctrl/pool/python/lsst/ctrl/pool/pool.py", line 1071, in run
|
while not menu[command]():
|
File "/home/pprice/LSST/ctrl/pool/python/lsst/ctrl/pool/pool.py", line 237, in wrapper
|
return func(*args, **kwargs)
|
File "/home/pprice/LSST/ctrl/pool/python/lsst/ctrl/pool/pool.py", line 1098, in reduce
|
self.comm.gather(out, root=self.root)
|
File "MPI/Comm.pyx", line 1281, in mpi4py.MPI.Comm.gather (src/mpi4py.MPI.c:108949)
|
File "MPI/msgpickle.pxi", line 659, in mpi4py.MPI.PyMPI_gather (src/mpi4py.MPI.c:47570)
|
File "MPI/msgpickle.pxi", line 119, in mpi4py.MPI.Pickle.dump (src/mpi4py.MPI.c:40840)
|
File "MPI/msgbuffer.pxi", line 35, in mpi4py.MPI.downcast (src/mpi4py.MPI.c:29070)
|
OverflowError: integer 4831838248 does not fit in 'int'
|
application called MPI_Abort(MPI_COMM_WORLD, 1) - process 1
|
One possibility is that the size of the pickle has grown too large. Could you please post the command that produced this error? Are there any more clues (from the log, perhaps) about exactly what the data is that is being passed and causing the error?