Commit 5a749421 authored by Peter-Bernd Otte's avatar Peter-Bernd Otte

rank 0 participates as worker

parent 83deb81b
......@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# Workload Distributor
# Version 0.1
# Version 0.2
# Minimal python version 3.6
# on Mogon 2: run first
......@@ -13,18 +13,20 @@ import platform
import multiprocessing #to get number of cores
import argparse
import subprocess
from time import sleep
import time
import os
import datetime
import logging
from threading import Thread
logging.basicConfig(level=logging.WARNING,format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
logger = logging.getLogger('workload-manager')
size = comm.Get_size()
rank = comm.Get_rank()
logging.basicConfig(level=logging.WARNING,format='%(asctime)-15s %(name)-6s %(levelname)-8s %(message)s')
logger = logging.getLogger('rank'+str(rank))
if size<2:
logger.error("Minimum number of MPI ranks = 2, please run this script with the appropriate launcher. eg mpirun -n 4 ./ [execname]")
......@@ -32,18 +34,17 @@ if size<2:
parser = argparse.ArgumentParser(description='Workload distributor for trivial parallelism.')
parser.add_argument('execname', help='name of executable to call')
parser.add_argument("-v", "--verbose", help="increase output verbosity", default=False, action="store_true")
parser.add_argument("-V", "--version", help="Returns the actual program version", action="version", version='%(prog)s 0.1')
parser.add_argument("-V", "--version", help="Returns the actual program version", action="version", version='%(prog)s 0.2')
parser.add_argument("-d", "--delay", help="time delay in ms between starts of consecutive workers", default=50, type=int)
parser.add_argument("inputdir", help="specifies the directory with files to process")
parser.add_argument("inputdir", help="specifies the directory with files to process", default=".")
clients = {}
if rank == 0:
args = parser.parse_args()
if args.verbose:
clients = {}
if rank == 0:"Verbosity turned on")"Input dir: "+args.inputdir)"Delay between files: "+str(args.delay)+" ms")
......@@ -66,56 +67,122 @@ if rank == 0:
if args.verbose:"output directory created.")'Total number of workers '+str(size-1) )
for i in range(1,size):
clients[i] = comm.recv(source=i)'Total number of workers '+str(size) )
##### Send Data to master rank
# replace later with: MPI_Gathering
nodeinfo = {'machine':platform.machine(), 'hostname': platform.node(), 'Ncores': multiprocessing.cpu_count(), 'os': platform.system(), 'pythonVersion': platform.python_version() }
h5 = comm.isend(nodeinfo, dest=0, tag=5)
if rank ==0:
h3 = {}
for i in range(0,size):
clients[i] = comm.recv(source=i, tag=5)
### Distribute Settings
# Replace later with MPI_Broadcasting
if rank ==0:
for i in range(0,size):
#distribute settings
if args.verbose:
if settings["verbose"]:
logger.setLevel(logging.DEBUG)"Worker "+str(rank)+" initialised.")
if rank ==0:
for i in range(0,size):
h3[i].wait()"All clients active. Printing clients details:")
for i in range(len(worklist)):
if args.verbose:"ToDo: "+worklist[i])"Wait for process...")
rec = comm.recv(tag=2) # get the rank of free worker
comm.send({"workstr":args.execname + " " + args.inputdir+"/"+worklist[i] + " "+outputdir+"/"+str(i)+"/outfile.txt" ,"jobid":i,"outputdir":outputdir}, dest=rec)
for i in range(1,size): # End all workers by sending an empty task
comm.send({"workstr":""}, i)
if args.verbose:"Controlling master ended.")
#### Go into working loop
WLPosition = 0
SenderFSM = 0 #0=Preparing for receive, 1=waiting between jobs, 2=waiting for rank to ask for taks, 5=Send empty taks to all
WorkerFSM = 0 #0=Send a request to Master, 1=wait for sending to comnplete, prepare for answer, 2=waiting for answer, start processing, 3=running, 4=completed and reset
SenderLastJobTimeStamp = 0 # to allow for delays between launched jobs
while True:
# Distribution stuff
if (rank == 0):
if SenderFSM==0:
if (WLPosition < len(worklist)): # still some work to be distributed
r2 = comm.irecv(tag=2) # wait for a worker to connect and get the rank of the free worker"Waiting for some worker to connect...")
SenderFSM = 1
else: #no more work in the list
SenderFSM = 5
for i in range(0,size): # End all workers by sending an empty task"Sending an empty task to rank "+str(i)+"...")
comm.send({"workstr":""}, i, tag=4)
if SenderFSM == 1: #delay time between jobs
if (time.time() - SenderLastJobTimeStamp > args.delay/1000):
SenderFSM = 2
nodeinfo = {'machine':platform.machine(), 'hostname': platform.node(), 'Ncores': multiprocessing.cpu_count(), 'os': platform.system(), 'pythonVersion': platform.python_version() }
comm.send(nodeinfo, dest=0)
if settings["verbose"]:
print ("Worker "+str(rank)+" started.")
if SenderFSM == 2:
if r2.Get_status():
rec = r2.wait()
SenderFSM = 0
while True:
comm.send(rank, dest=0, tag=2) # send a data package asking for more work
dic = comm.recv(source=0)
i = WLPosition"ToDo: "+worklist[i])"Send a new job to rank "+str(rec)+"...")
workstr = args.execname + " " + args.inputdir+"/"+worklist[i] + " "+outputdir+"/"+str(i)+"/outfile.txt"
# workstr = args.execname
s4 = comm.isend({"workstr":workstr, "jobid":i,"outputdir":outputdir}, dest=rec, tag=4)
WLPosition += 1
SenderLastJobTimeStamp = time.time()
# Worker itself
if WorkerFSM==0:
h2 = comm.isend(rank, dest=0, tag=2) # send a data package asking for more work
WorkerFSM = 1
if WorkerFSM==1:
if h2.Test():
r4 = comm.irecv(source=0, tag=4)
WorkerFSM = 2
if WorkerFSM==2:
if r4.Get_status():
dic = r4.wait()
workstr = dic["workstr"]
if len(workstr)==0: #check if no more to do from master
outputdir = dic["outputdir"]
jobid = dic["jobid"]
if settings["verbose"]:
print("Worker "+str(rank)+" recieved (internal job nr="+str(jobid)+"): "+workstr)"Worker "+str(rank)+" recieved (internal job nr="+str(jobid)+"): "+workstr)
bashCommand = workstr + " > "+outputdir+"/"+str(jobid)+"/std_out.txt 2> "+outputdir+"/"+str(jobid)+"/err_out.txt"
completed =, shell=True)
if settings["verbose"]:
print('Worker '+str(rank)+' returncode: ',completed.returncode)
p = subprocess.Popen(bashCommand, shell=True)'Worker startet job. ')
except subprocess.CalledProcessError as err:
if settings["verbose"]:
print('Worker '+str(rank)+' ERROR:', err)'Worker '+str(rank)+' ERROR:', err)
WorkerFSM = 3
if WorkerFSM == 3: # Running
if p.poll() is not None:"Process ended, ret code: "+str(p.returncode))
WorkerFSM = 4
if WorkerFSM==4:"Worker "+str(rank)+": Work completed and reset")
if settings["verbose"]:
print("Worker "+str(rank)+" ended.")
time.sleep(0.001) #to take some load off the node"Worker "+str(rank)+" ended.")
if rank ==0:"Controlling master ended.")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment