flashinfer-ai · zobinHuang · Apr 1, 2025 · Apr 2, 2025 · Apr 2, 2025
diff --git a/flashinfer/jit/core.py b/flashinfer/jit/core.py
@@ -1,13 +1,14 @@
 import logging
 import os
 import re
+import shutil
 from contextlib import suppress
 from pathlib import Path
 from typing import List, Optional, Union
 
 import torch
 import torch.utils.cpp_extension as torch_cpp_ext
-from filelock import FileLock
+from filelock import FileLock, Timeout
 
 from .env import CUTLASS_INCLUDE_DIRS as CUTLASS_INCLUDE_DIRS
 from .env import FLASHINFER_CSRC_DIR as FLASHINFER_CSRC_DIR
@@ -80,6 +81,14 @@ def remove_unwanted_pytorch_nvcc_flags():
 sm90a_nvcc_flags = ["-gencode", "arch=compute_90a,code=sm_90a"]
 
 
+def cleanup_compiled_ops(name: str) -> None:
+    """Delete the file lock and the directory of JIT op `name` under FLASHINFER_JIT_DIR."""
+    with suppress(FileNotFoundError):  # EAFP: a concurrent process may remove it first
+        os.remove(FLASHINFER_JIT_DIR / f"{name}.lock")
+    with suppress(FileNotFoundError):  # a missing directory means nothing to clean up
+        shutil.rmtree(FLASHINFER_JIT_DIR / name)
+
+
 def load_cuda_ops(
     name: str,
     sources: List[Union[str, Path]],
@@ -127,21 +136,25 @@ def load_cuda_ops(
         FLASHINFER_INCLUDE_DIR,
         FLASHINFER_CSRC_DIR,
     ] + CUTLASS_INCLUDE_DIRS
-    lock = FileLock(FLASHINFER_JIT_DIR / f"{name}.lock", thread_local=False)
-    with lock:
-        torch_cpp_ext.load(
-            name,
-            list(map(lambda _: str(_), sources)),
-            extra_cflags=cflags,
-            extra_cuda_cflags=cuda_cflags,
-            extra_ldflags=extra_ldflags,
-            extra_include_paths=list(map(lambda _: str(_), extra_include_paths)),
-            build_directory=build_directory,
-            verbose=verbose,
-            with_cuda=True,
-            # We switched to torch.library, so will be loaded into torch.ops
-            # instead of into a separate module.
-            is_python_module=False,
-        )
+    lock = FileLock(FLASHINFER_JIT_DIR / f"{name}.lock", timeout=30, thread_local=False)
+    try:
+        with lock:
+            torch_cpp_ext.load(
+                name,
+                list(map(lambda _: str(_), sources)),
+                extra_cflags=cflags,
+                extra_cuda_cflags=cuda_cflags,
+                extra_ldflags=extra_ldflags,
+                extra_include_paths=list(map(lambda _: str(_), extra_include_paths)),
+                build_directory=build_directory,
+                verbose=verbose,
+                with_cuda=True,
+                # We switched to torch.library, so will be loaded into torch.ops
+                # instead of into a separate module.
+                is_python_module=False,
+            )
+    except Timeout:
+        raise RuntimeError(f"failed to require JIT filelock for JIT ops {name}")
+
     logger.info(f"Finished loading JIT ops: {name}")
     return getattr(torch.ops, name)
diff --git a/tests/test_jit_filelock.py b/tests/test_jit_filelock.py
@@ -0,0 +1,35 @@
+"""
+Copyright (c) 2024 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+
+from flashinfer.jit.core import FLASHINFER_JIT_DIR, cleanup_compiled_ops
+from flashinfer.sampling import get_sampling_module
+
+
+def _compile_sampling_kernel(rank: int):  # rank: worker index passed by mp.spawn (unused)
+    get_sampling_module()
+
+
+def test_multiprocess_jit_compile_same_kernel():
+    # remove any existing "sampling" lock file and JIT directory before spawning
+    cleanup_compiled_ops("sampling")
+    # create 4 processes, each process should compile the same kernel
+    mp.spawn(fn=_compile_sampling_kernel, args=(), nprocs=4, join=True)