Commit 3fbc2355 authored by Alexandre Boyer's avatar Alexandre Boyer
Browse files

fix monitor: use a thread to share boolean attribute indicating end of main func

parent 942ec98b
Pipeline #130699 passed with stage
in 30 seconds
......@@ -16,14 +16,13 @@ import datetime, time
import logging
import signal
from multiprocessing import *
from threading import Thread
from diva_evaluation_cli.bin.private_src.implementation.resources_monitoring.nvml_handler import NvmlHandler
from diva_evaluation_cli.bin.private_src.implementation.resources_monitoring import utils
logger = logging.getLogger('MONITOR')
class Monitor():
"""Class that implements a monitor
......@@ -40,7 +39,7 @@ class Monitor():
"""
def __init__(self, main_function, args, command_name='default_command', interval=30):
def __init__(self, main_function, args, command_name='default_command', monitor_interval=5, log_interval=30):
"""
Args:
main_function (function): Python function that will be run as a subprocess and monitored
......@@ -52,11 +51,12 @@ class Monitor():
self.command_name = command_name
self.main_function = main_function
self.args = args
self.interval = interval
self.monitor_interval = monitor_interval
self.log_interval = log_interval
log_file_path = os.path.dirname(__file__)
self.log_file = os.path.join(log_file_path, './resources_monitoring.json')
self.main_function_is_running = False
def run_monitor(self):
......@@ -66,13 +66,14 @@ class Monitor():
`monitoring_process()` function is in a process.
"""
self.main_process_pid = os.getpid()
self.main_function(**self.args.__dict__)
self.monitor = Process(target=self.monitor_resources)
self.monitor = Thread(target=self.monitor_resources)
self.monitor.start()
try:
self.main_function_is_running = True
self.main_function(**self.args.__dict__)
finally:
self.main_function_is_running = False
def increment_log_file(self, log_dict):
"""Increment the file pointed by the `log_file` attribute with a new dict
......@@ -84,7 +85,6 @@ class Monitor():
log_dict (:obj:`dict`): A new dictionary to increment the existing log_file
"""
with open(self.log_file, 'a') as f:
f.write(json.dumps(log_dict) + ',\n')
......@@ -100,27 +100,33 @@ class Monitor():
# Try to retrieve the main_process
try:
process = psutil.Process(self.main_process_pid)
logger.debug('MAIN PID: ', self.main_process_pid)
logger.debug('MONITOR PID: ', os.getpid())
logger.debug('MAIN PID: %d' % self.main_process_pid)
logger.debug('MONITOR PID: %d' % os.getpid())
log_time_counter = 0
# Load the NVIDIA handler
with NvmlHandler() as nvml_h:
while process.is_running() and process.status() != psutil.STATUS_ZOMBIE and process.status() != psutil.STATUS_SLEEPING:
# Gather resources use from psutil
resources_use = utils.psutil_snapshot()
while process.is_running() and process.status() != psutil.STATUS_ZOMBIE and self.main_function_is_running:
if log_time_counter >= self.log_interval:
# Gather resources use from psutil
resources_use = utils.psutil_snapshot()
# Get status from the GPUs
resources_use.update({'gpus':nvml_h.get_devices_status()})
# Get status from the GPUs
resources_use.update({'gpus':nvml_h.get_devices_status()})
# Get the command that's currently run
resources_use.update({'command_line':self.command_name})
# Get the command that's currently run
resources_use.update({'command_line':self.command_name})
# Update the timestamp
resources_use.update({'timestamp':datetime.datetime.now().isoformat()})
# Update the timestamp
resources_use.update({'timestamp':datetime.datetime.now().isoformat()})
# Increment the logs file
self.increment_log_file(resources_use)
# Increment the logs file
self.increment_log_file(resources_use)
# Reset log_time_counter
log_time_counter = 0
time.sleep(self.interval)
log_time_counter += self.monitor_interval
time.sleep(self.monitor_interval)
except Exception as e:
logger.debug(e)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment