Skip to content

Commit 45bfddd

Browse files
committed
TRY fix fault inject
1 parent f0e8730 commit 45bfddd

File tree

16 files changed

+266
-111
lines changed

16 files changed

+266
-111
lines changed

mars/deploy/oscar/tests/test_cmdline.py

+34-34
Original file line number | Diff line number | Diff line change
@@ -111,38 +111,6 @@ def _get_labelled_port(label=None, create=True):
111111

112112
supervisor_cmd_start = [sys.executable, "-m", "mars.deploy.oscar.supervisor"]
113113
worker_cmd_start = [sys.executable, "-m", "mars.deploy.oscar.worker"]
114-
start_params = {
115-
"bare_start": [
116-
supervisor_cmd_start,
117-
worker_cmd_start
118-
+ [
119-
"--config-file",
120-
os.path.join(os.path.dirname(__file__), "local_test_config.yml"),
121-
],
122-
False,
123-
],
124-
"with_supervisors": [
125-
supervisor_cmd_start
126-
+ [
127-
"-e",
128-
lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}',
129-
"-w",
130-
lambda: str(_get_labelled_port("web")),
131-
"--n-process",
132-
"2",
133-
],
134-
worker_cmd_start
135-
+ [
136-
"-e",
137-
lambda: f"127.0.0.1:{get_next_port(occupy=True)}",
138-
"-s",
139-
lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}',
140-
"--config-file",
141-
os.path.join(os.path.dirname(__file__), "local_test_config.yml"),
142-
],
143-
True,
144-
],
145-
}
146114

147115

148116
def _reload_args(args):
@@ -159,8 +127,40 @@ def _reload_args(args):
159127

160128
@pytest.mark.parametrize(
161129
"supervisor_args,worker_args,use_web_addr",
162-
list(start_params.values()),
163-
ids=list(start_params.keys()),
130+
[
131+
pytest.param(
132+
supervisor_cmd_start,
133+
worker_cmd_start
134+
+ [
135+
"--config-file",
136+
os.path.join(os.path.dirname(__file__), "local_test_config.yml"),
137+
],
138+
False,
139+
id="bare_start",
140+
),
141+
pytest.param(
142+
supervisor_cmd_start
143+
+ [
144+
"-e",
145+
lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}',
146+
"-w",
147+
lambda: str(_get_labelled_port("web")),
148+
"--n-process",
149+
"2",
150+
],
151+
worker_cmd_start
152+
+ [
153+
"-e",
154+
lambda: f"127.0.0.1:{get_next_port(occupy=True)}",
155+
"-s",
156+
lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}',
157+
"--config-file",
158+
os.path.join(os.path.dirname(__file__), "local_test_config.yml"),
159+
],
160+
True,
161+
id="with_supervisors",
162+
),
163+
],
164164
)
165165
@flaky(max_runs=10, rerun_filter=lambda err, *_: issubclass(err[0], _rerun_errors))
166166
def test_cmdline_run(supervisor_args, worker_args, use_web_addr):

mars/services/scheduling/api/oscar.py

-14
Original file line number | Diff line number | Diff line change
@@ -117,20 +117,6 @@ async def cancel_subtasks(
117117
"""
118118
await self._manager_ref.cancel_subtasks(subtask_ids, kill_timeout=kill_timeout)
119119

120-
async def finish_subtasks(self, subtask_ids: List[str], schedule_next: bool = True):
121-
"""
122-
Mark subtasks as finished, letting scheduling service to schedule
123-
next tasks in the ready queue
124-
125-
Parameters
126-
----------
127-
subtask_ids
128-
ids of subtasks to mark as finished
129-
schedule_next
130-
whether to schedule succeeding subtasks
131-
"""
132-
await self._manager_ref.finish_subtasks(subtask_ids, schedule_next)
133-
134120

135121
class MockSchedulingAPI(SchedulingAPI):
136122
@classmethod

mars/services/scheduling/supervisor/manager.py

+36-11
Original file line number | Diff line number | Diff line change
@@ -95,8 +95,7 @@ async def __post_create__(self):
9595
AssignerActor.gen_uid(self._session_id), address=self.address
9696
)
9797

98-
@alru_cache
99-
async def _get_task_api(self):
98+
async def _get_task_api(self) -> TaskAPI:
10099
return await TaskAPI.create(self._session_id, self.address)
101100

102101
def _put_subtask_with_priority(self, subtask: Subtask, priority: Tuple = None):
@@ -272,21 +271,47 @@ async def update_subtask_priorities(
272271

273272
@alru_cache(maxsize=10000)
274273
async def _get_execution_ref(self, address: str):
275-
from ..worker.exec import SubtaskExecutionActor
274+
from ..worker.execution import SubtaskExecutionActor
276275

277276
return await mo.actor_ref(SubtaskExecutionActor.default_uid(), address=address)
278277

279-
async def finish_subtasks(self, subtask_ids: List[str], schedule_next: bool = True):
280-
band_tasks = defaultdict(lambda: 0)
281-
for subtask_id in subtask_ids:
282-
subtask_info = self._subtask_infos.pop(subtask_id, None)
278+
async def set_subtask_results(
279+
self, subtask_results: List[SubtaskResult], source_bands: List[BandType]
280+
):
281+
delays = []
282+
task_api = await self._get_task_api()
283+
for result, band in zip(subtask_results, source_bands):
284+
if result.status == SubtaskStatus.errored:
285+
subtask_info = self._subtask_infos.get(result.subtask_id)
286+
if (
287+
subtask_info is not None
288+
and subtask_info.subtask.retryable
289+
and subtask_info.num_reschedules < subtask_info.max_reschedules
290+
and isinstance(result.error, (MarsError, OSError))
291+
):
292+
subtask_info.num_reschedules += 1
293+
logger.warning(
294+
"Resubmit subtask %s at attempt %d",
295+
subtask_info.subtask.subtask_id,
296+
subtask_info.num_reschedules,
297+
)
298+
execution_ref = await self._get_execution_ref(band[0])
299+
await execution_ref.submit_subtasks.tell(
300+
[subtask_info.subtask],
301+
[subtask_info.priority],
302+
self.address,
303+
band[1],
304+
)
305+
continue
306+
307+
subtask_info = self._subtask_infos.pop(result.subtask_id, None)
283308
if subtask_info is not None:
284-
self._subtask_summaries[subtask_id] = subtask_info.to_summary(
309+
self._subtask_summaries[result.subtask_id] = subtask_info.to_summary(
285310
is_finished=True
286311
)
287-
if schedule_next:
288-
for band in subtask_info.submitted_bands:
289-
band_tasks[band] += 1
312+
delays.append(task_api.set_subtask_result.delay(result))
313+
314+
await task_api.set_subtask_result.batch(*delays)
290315

291316
def _get_subtasks_by_ids(self, subtask_ids: List[str]) -> List[Optional[Subtask]]:
292317
subtasks = []

mars/services/scheduling/tests/test_service.py

-3
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,6 @@ async def test_schedule_success(actor_pools):
171171
subtask.expect_bands = [(worker_pool.external_address, "numa-0")]
172172
await scheduling_api.add_subtasks([subtask], [(0,)])
173173
await task_manager_ref.wait_subtask_result(subtask.subtask_id)
174-
await scheduling_api.finish_subtasks([subtask.subtask_id])
175174

176175
result_key = next(subtask.chunk_graph.iter_indep(reverse=True)).key
177176
result = await storage_api.get(result_key)
@@ -197,7 +196,6 @@ def _remote_fun(secs):
197196

198197
async def _waiter_fun(subtask_id):
199198
await task_manager_ref.wait_subtask_result(subtask_id)
200-
await scheduling_api.finish_subtasks([subtask_id])
201199
finish_ids.append(subtask_id)
202200
finish_time.append(time.time())
203201

@@ -245,7 +243,6 @@ def _remote_fun(secs):
245243

246244
async def _waiter_fun(subtask_id):
247245
await task_manager_ref.wait_subtask_result(subtask_id)
248-
await scheduling_api.finish_subtasks([subtask_id])
249246

250247
subtasks = []
251248
wait_tasks = []

mars/services/scheduling/worker/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from .exec import SubtaskExecutionActor
15+
from .execution import SubtaskExecutionActor
1616
from .queues import SubtaskExecutionQueueActor, SubtaskPrepareQueueActor
1717
from .quota import QuotaActor, MemQuotaActor, WorkerQuotaManagerActor
1818
from .service import SchedulingWorkerService

mars/services/scheduling/worker/exec/__init__.py renamed to mars/services/scheduling/worker/execution/__init__.py

+1
Original file line number | Diff line number | Diff line change
@@ -13,4 +13,5 @@
1313
# limitations under the License.
1414

1515
from .actor import SubtaskExecutionActor
16+
from .core import SubtaskExecutionInfo
1617
from .prepare import SubtaskPreparer

0 commit comments

Comments
 (0)