Details
-
Type:
Story
-
Status: Won't Fix
-
Resolution: Done
-
Fix Version/s: None
-
Component/s: None
-
Labels:
-
Team: Telescope and Site
-
Urgent?: No
Description
The ESS should detect failures in data clients and go to fault (possibly after retrying?).
The CSC already appears to do this; see its run_data_clients method:
async def run_data_clients(self) -> None:
    """Await the run task of every data client until one of them fails.

    The ``run_task`` of each entry in ``self.data_clients`` is gathered
    into ``self.run_data_clients_task`` and awaited.

    If awaiting raises, the failure is logged, ``self.fault`` is called
    with an error code and report chosen from the failed task's exception,
    and the original exception is re-raised:

    * ``ConnectionError`` / ``asyncio.IncompleteReadError`` or
      ``asyncio.TimeoutError`` -> ``ErrorCode.ConnectionLost``
    * anything else, or no failed task found -> ``ErrorCode.RunFailed``

    Raises
    ------
    Exception
        Whatever exception was raised while gathering or awaiting the
        run tasks, after ``self.fault`` has been awaited.
    """
    run_tasks = [data_client.run_task for data_client in self.data_clients]
    try:
        self.run_data_clients_task = asyncio.gather(*run_tasks)
        await self.run_data_clients_task
    except Exception as gather_error:
        self.log.exception(f"run_data_clients failed: {gather_error!r}")
        # Presumably returns the index of the failed task and its exception,
        # with index None when no task failed -- confirm against the helper.
        failed_index, failed_error = get_task_index_exception(run_tasks)
        if failed_index is None:
            # The gather failed but no individual run task can be blamed;
            # only this case passes a formatted traceback to fault().
            fault_code = ErrorCode.RunFailed
            fault_report = (
                "run_data_clients failed, but no run task failed; "
                f"please report as a bug: {gather_error}"
            )
            fault_traceback = traceback.format_exc()
        else:
            fault_traceback = None
            failed_client = self.data_clients[failed_index]
            if isinstance(
                failed_error, (ConnectionError, asyncio.IncompleteReadError)
            ):
                fault_code = ErrorCode.ConnectionLost
                fault_report = (
                    f"{failed_client} lost connection to its data server: "
                    f"{failed_error}"
                )
            elif isinstance(failed_error, asyncio.TimeoutError):
                fault_code = ErrorCode.ConnectionLost
                fault_report = f"{failed_client} timed out waiting for data"
            else:
                fault_code = ErrorCode.RunFailed
                fault_report = (
                    f"{failed_client} failed while running: {failed_error!r}"
                )
        await self.fault(
            code=fault_code, report=fault_report, traceback=fault_traceback
        )
        # Re-raise so the awaiting caller still sees the original failure.
        raise