
[SPARK-51272][CORE] Aborting instead of re-submitting of partially completed indeterminate result stage #50630

Open

wants to merge 2 commits into master
@@ -1554,6 +1554,12 @@ private[spark] class DAGScheduler(
case sms: ShuffleMapStage if stage.isIndeterminate && !sms.isAvailable =>
mapOutputTracker.unregisterAllMapAndMergeOutput(sms.shuffleDep.shuffleId)
sms.shuffleDep.newShuffleMergeState()
case rs: ResultStage if rs.isIndeterminate &&
rs.findMissingPartitions().length != rs.partitions.length =>
abortStage(rs, "Re-submit of a partially completed indeterminate result stage is not " +
"supported", None)
runningStages -= stage
return
case _ =>
}
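
For context, a hedged sketch (not part of this PR) of the kind of job shape that can reach the new abort path. A round-robin repartition shuffle is the classic source of indeterminate output: its result depends on the order in which its input rows arrive, so a result stage that reads it is marked indeterminate. If some result tasks have already succeeded when that shuffle output is lost, the remaining tasks would recompute against regenerated, possibly different, data, so re-submitting only the missing partitions is no longer safe and the stage is aborted instead.

import org.apache.spark.{SparkConf, SparkContext}

// Illustration only, not code from this PR: a result stage that is indeterminate
// because it reads a round-robin repartition shuffle.
val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("indeterminate-sketch"))
val doubled = sc.parallelize(1 to 1000, 10)
  .repartition(10)   // round-robin shuffle: output depends on input order, i.e. indeterminate
  .map(_ * 2)
doubled.collect()    // result stage reading the indeterminate shuffle output
sc.stop()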

147 changes: 122 additions & 25 deletions core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -18,11 +18,11 @@
package org.apache.spark.scheduler

import java.util.{ArrayList => JArrayList, Collections => JCollections, Properties}
import java.util.concurrent.{CountDownLatch, Delayed, ScheduledFuture, TimeUnit}
import java.util.concurrent.{CountDownLatch, Delayed, LinkedBlockingQueue, ScheduledFuture, TimeUnit}
import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong, AtomicReference}

import scala.annotation.meta.param
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map}
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map}
import scala.jdk.CollectionConverters._
import scala.language.reflectiveCalls
import scala.util.control.NonFatal
@@ -56,28 +56,31 @@ class DAGSchedulerEventProcessLoopTester(dagScheduler: DAGScheduler)

dagScheduler.setEventProcessLoop(this)

private var isProcessing = false
private val eventQueue = new ListBuffer[DAGSchedulerEvent]()

private val eventQueue = new LinkedBlockingQueue[DAGSchedulerEvent]()

override def post(event: DAGSchedulerEvent): Unit = {
if (isProcessing) {
// `DAGSchedulerEventProcessLoop` is guaranteed to process events sequentially. So we should
// buffer events for sequent processing later instead of processing them recursively.
eventQueue += event
} else {
try {
isProcessing = true
// Forward event to `onReceive` directly to avoid processing event asynchronously.
onReceive(event)
} catch {
case NonFatal(e) => onError(e)
} finally {
isProcessing = false
}
if (eventQueue.nonEmpty) {
post(eventQueue.remove(0))
}
// `DAGSchedulerEventProcessLoop` is guaranteed to process events sequentially in the main test

Contributor (Author) commented:

This part was modified because, without it, I saw the following in unit-tests.log (note the thread names: pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite vs. dag-scheduler-message):

25/04/17 14:15:05.815 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1662
25/04/17 14:15:05.815 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: Submitting 2 missing tasks from ResultStage 2 (DAGSchedulerSuiteRDD 2) (first 15 tasks are for partitions Vector(0, 1))
25/04/17 14:15:05.816 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: Marking ResultStage 2 () as failed due to a fetch failure from ShuffleMapStage 1 (RDD at DAGSchedulerSuite.scala:123)
25/04/17 14:15:05.817 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: ResultStage 2 () failed in 3 ms due to ignored
25/04/17 14:15:05.817 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: Resubmitting ShuffleMapStage 1 (RDD at DAGSchedulerSuite.scala:123) and ResultStage 2 () due to fetch failure
25/04/17 14:15:05.817 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: Executor lost: hostA-exec (epoch 3)
25/04/17 14:15:05.818 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: Shuffle files lost for executor: hostA-exec (epoch 3)
25/04/17 14:15:06.023 dag-scheduler-message INFO DAGSchedulerSuite$MyDAGScheduler: Resubmitting failed stages
25/04/17 14:15:06.024 dag-scheduler-message INFO DAGSchedulerSuite$MyDAGScheduler: Submitting ShuffleMapStage 1 (DAGSchedulerSuiteRDD 0), which has no missing parents
25/04/17 14:15:06.025 dag-scheduler-message INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 2.9 KiB, free 2.4 GiB)
25/04/17 14:15:06.025 dag-scheduler-message INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 1825.0 B, free 2.4 GiB)
25/04/17 14:15:06.026 dag-scheduler-message INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:1662
25/04/17 14:15:06.027 dag-scheduler-message INFO DAGSchedulerSuite$MyDAGScheduler: Submitting 1 missing tasks from ShuffleMapStage 1 (DAGSchedulerSuiteRDD 0) (first 15 tasks are for partitions Vector(0))
25/04/17 14:15:06.028 dag-scheduler-message INFO DAGSchedulerSuite$MyDAGScheduler: Submitting ShuffleMapStage 0 (DAGSchedulerSuiteRDD 1), which has no missing parents
25/04/17 14:15:06.029 dag-scheduler-message INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 2.9 KiB, free 2.4 GiB)
25/04/17 14:15:06.029 dag-scheduler-message INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 1826.0 B, free 2.4 GiB)
25/04/17 14:15:06.030 dag-scheduler-message INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:1662
25/04/17 14:15:06.030 dag-scheduler-message INFO DAGSchedulerSuite$MyDAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 0 (DAGSchedulerSuiteRDD 1) (first 15 tasks are for partitions Vector(0, 1))
25/04/17 14:15:06.227 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: ShuffleMapStage 0 (RDD at DAGSchedulerSuite.scala:123) finished in 198 ms
25/04/17 14:15:06.228 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: looking for newly runnable stages
25/04/17 14:15:06.228 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: running: HashSet(ShuffleMapStage 1)
25/04/17 14:15:06.229 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: waiting: HashSet(ResultStage 2)
25/04/17 14:15:06.229 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: failed: HashSet()
25/04/17 14:15:06.233 pool-1-thread-1-ScalaTest-running-DAGSchedulerSuite INFO DAGSchedulerSuite$MyDAGScheduler: ShuffleMapStage 1 (RDD at DAGSchedulerSuite.scala:123) finished in 209 ms

// thread, similarly to how it is done in production using the "dag-scheduler-event-loop".
// So we should buffer events for subsequent processing instead of executing them
// on the thread calling post() (which might be the "dag-scheduler-message" thread for some
// events posted by the DAGScheduler itself)
eventQueue.put(event)
}

def runEvents(): Unit = {
var dagEvent = eventQueue.poll()
while (dagEvent != null) {
onReceiveWithErrorHandler(dagEvent)
dagEvent = eventQueue.poll()
}
}

private def onReceiveWithErrorHandler(event: DAGSchedulerEvent): Unit = {
try {
// Forward event to `onReceive` directly to avoid processing event asynchronously.
onReceive(event)
} catch {
case NonFatal(e) => onError(e)
}
}
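
The rest of the suite is adapted to this two-step protocol: post() from any thread only enqueues the event, and the test drains the queue explicitly via runEvents(), mirroring the single "dag-scheduler-event-loop" thread used in production. A minimal usage sketch, where the drive helper is hypothetical and not part of this PR:

// Hypothetical helper for illustration: post() only enqueues, runEvents() then processes
// everything queued so far, in FIFO order, synchronously on the calling (test) thread.
def drive(loop: DAGSchedulerEventProcessLoopTester, events: Seq[DAGSchedulerEvent]): Unit = {
  events.foreach(loop.post(_))  // enqueue; nothing is processed yet
  loop.runEvents()              // drain the queue here, preserving posting order
}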

@@ -306,7 +309,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
var broadcastManager: BroadcastManager = null
var securityMgr: SecurityManager = null
var scheduler: DAGScheduler = null
var dagEventProcessLoopTester: DAGSchedulerEventProcessLoop = null
var dagEventProcessLoopTester: DAGSchedulerEventProcessLoopTester = null

/**
* Set of cache locations to return from our mock BlockManagerMaster.
@@ -479,6 +482,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
// Ensure the initialization of various components
sc
dagEventProcessLoopTester.post(event)
dagEventProcessLoopTester.runEvents()
}

/**
@@ -1190,11 +1194,12 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
private def completeNextStageWithFetchFailure(
stageId: Int,
attemptIdx: Int,
shuffleDep: ShuffleDependency[_, _, _]): Unit = {
shuffleDep: ShuffleDependency[_, _, _],
srcHost: String = "hostA"): Unit = {
val stageAttempt = taskSets.last
checkStageId(stageId, attemptIdx, stageAttempt)
complete(stageAttempt, stageAttempt.tasks.zipWithIndex.map { case (task, idx) =>
(FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0L, 0, idx, "ignored"), null)
(FetchFailed(makeBlockManagerId(srcHost), shuffleDep.shuffleId, 0L, 0, idx, "ignored"), null)
}.toSeq)
}

@@ -2251,6 +2256,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
assert(completedStage === List(0, 1))

Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()
// map stage resubmitted
assert(scheduler.runningStages.size === 1)
val mapStage = scheduler.runningStages.head
@@ -2286,6 +2292,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
sc.listenerBus.waitUntilEmpty()

Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()
// map stage is running after being resubmitted, result stage is waiting
// map tasks and the origin result task 1.0 are running
assert(scheduler.runningStages.size == 1, "Map stage should be running")
@@ -3125,6 +3132,92 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
assert(countSubmittedMapStageAttempts() === 2)
}

/**
* This function creates the following dependency graph:
*
*      (determinate)       (indeterminate)
*     shuffleMapRdd0       shuffleMapRDD1
*               \               /
*                \             /
*                   finalRdd
*
* Both ShuffleMapRdds will be ShuffleMapStages with 2 partitions executed on
* hostA_exec and hostB_exec.
*/
def constructMixedDeterminateDependencies():
(ShuffleDependency[_, _, _], ShuffleDependency[_, _, _]) = {
val numPartitions = 2
val shuffleMapRdd0 = new MyRDD(sc, numPartitions, Nil, indeterminate = false)

val shuffleDep0 = new ShuffleDependency(shuffleMapRdd0, new HashPartitioner(2))
val shuffleId0 = shuffleDep0.shuffleId
val shuffleMapRdd1 =
new MyRDD(sc, numPartitions, Nil, tracker = mapOutputTracker, indeterminate = true)

val shuffleDep1 = new ShuffleDependency(shuffleMapRdd1, new HashPartitioner(2))
val shuffleId1 = shuffleDep1.shuffleId
val finalRdd =
new MyRDD(sc, numPartitions, List(shuffleDep0, shuffleDep1), tracker = mapOutputTracker)

submit(finalRdd, Array(0, 1))
val stageId0 = this.scheduler.shuffleIdToMapStage(shuffleId0).id

// Finish both shuffle map stages.
completeShuffleMapStageSuccessfully(0, 0, numPartitions, Seq("hostA", "hostB"))
completeShuffleMapStageSuccessfully(1, 0, numPartitions, Seq("hostA", "hostB"))
assert(mapOutputTracker.findMissingPartitions(0) === Some(Seq.empty))
assert(mapOutputTracker.findMissingPartitions(1) === Some(Seq.empty))

(shuffleDep0, shuffleDep1)
}

test("SPARK-51272: re-submit of an indeterminate stage whithout partial result can succeed") {
val shuffleDeps = constructMixedDeterminateDependencies()
val resultStage = scheduler.stageIdToStage(2).asInstanceOf[ResultStage]

// the fetch failure is from the determinate shuffle map stage, but it leads to an executor
// loss which also removes the shuffle files generated by the indeterminate stage
completeNextStageWithFetchFailure(resultStage.id, 0, shuffleDeps._1, "hostA")

Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()
assert(scheduler.runningStages.size === 2)
assert(scheduler.runningStages.forall(_.isInstanceOf[ShuffleMapStage]))

completeShuffleMapStageSuccessfully(0, 1, 2, Seq("hostA", "hostB"))
completeShuffleMapStageSuccessfully(1, 1, 2, Seq("hostA", "hostB"))
assert(scheduler.runningStages.size === 1)
assert(scheduler.runningStages.head === resultStage)
assert(resultStage.latestInfo.failureReason.isEmpty)

completeNextResultStageWithSuccess(resultStage.id, 1)
}

test("SPARK-51272: re-submit of an indeterminate stage whith partial result will fail") {
val shuffleDeps = constructMixedDeterminateDependencies()
val resultStage = scheduler.stageIdToStage(2).asInstanceOf[ResultStage]

runEvent(makeCompletionEvent(taskSets(2).tasks(0), Success, 42))
// the fetch failure is from the determinate shuffle map stage, but it leads to an executor
// loss which also removes the shuffle files generated by the indeterminate stage
runEvent(makeCompletionEvent(
taskSets(2).tasks(1),
FetchFailed(makeBlockManagerId("hostA"), shuffleDeps._1.shuffleId, 0L, 0, 0, "ignored"),
null))

Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()
assert(scheduler.runningStages.size === 2)
assert(scheduler.runningStages.forall(_.isInstanceOf[ShuffleMapStage]))

completeShuffleMapStageSuccessfully(0, 1, 2, Seq("hostA", "hostB"))
completeShuffleMapStageSuccessfully(1, 1, 2, Seq("hostA", "hostB"))
assert(scheduler.runningStages.size === 0)
assert(resultStage.latestInfo.failureReason.isDefined)
assert(resultStage.latestInfo.failureReason.get === "Job aborted due to stage failure: " +
"Re-submit of a partially completed indeterminate result stage is not supported")
}

private def constructIndeterminateStageFetchFailed(): (Int, Int) = {
val shuffleMapRdd1 = new MyRDD(sc, 2, Nil, indeterminate = true)

@@ -4884,6 +4977,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
// wait resubmit
sc.listenerBus.waitUntilEmpty()
Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()

// stage0 retry
val stage0Retry = taskSets.filter(_.stageId == 1)
@@ -4984,6 +5078,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti

// the stages will now get resubmitted due to the failure
Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()

// parent map stage resubmitted
assert(scheduler.runningStages.size === 1)
@@ -5003,6 +5098,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
result = MapStatus(BlockManagerId("hostF-exec1", "hostF", 12345),
Array.fill[Long](2)(2), mapTaskId = taskIdCount)))
Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()

// The retries should succeed
sc.listenerBus.waitUntilEmpty()
@@ -5012,6 +5108,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
// This will add 3 new stages.
submit(reduceRdd, Array(0, 1))
Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2)
dagEventProcessLoopTester.runEvents()

// Only the last stage needs to execute, and those tasks - so completed stages should not
// change.