Submitting a stage

Published: 2020-06-13 | Author: 惡魔蘇醒ing | Category: Big Data

  //Submit the stage: create a batch of tasks for it; the number of tasks equals the number of partitions to compute

  private def submitMissingTasks(stage: Stage, jobId: Int) {

    logDebug("submitMissingTasks(" + stage + ")")

    // Get our pending tasks and remember them in our pendingTasks entry

    stage.pendingTasks.clear()


    // First figure out the indexes of partition ids to compute.

//Determine which partition ids still need to be computed, i.e. how many tasks to create

    val partitionsToCompute: Seq[Int] = {

      if (stage.isShuffleMap) {

        (0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)

      } else {

        val job = stage.resultOfJob.get

        (0 until job.numPartitions).filter(id => !job.finished(id))

      }

    }


    val properties = if (jobIdToActiveJob.contains(jobId)) {

      jobIdToActiveJob(stage.jobId).properties

    } else {

      // this stage will be assigned to "default" pool

      null

    }

//Add the stage to the set of running stages

    runningStages += stage

    // SparkListenerStageSubmitted should be posted before testing whether tasks are

    // serializable. If tasks are not serializable, a SparkListenerStageCompleted event

    // will be posted, which should always come after a corresponding SparkListenerStageSubmitted

    // event.

    stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))

    outputCommitCoordinator.stageStart(stage.id)

    listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))


    // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.

    // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast

    // the serialized copy of the RDD and for each task we will deserialize it, which means each

    // task gets a different copy of the RDD. This provides stronger isolation between tasks that

    // might modify state of objects referenced in their closures. This is necessary in Hadoop

    // where the JobConf/Configuration object is not thread-safe.

    var taskBinary: Broadcast[Array[Byte]] = null

    try {

      // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).

      // For ResultTask, serialize and broadcast (rdd, func).

      val taskBinaryBytes: Array[Byte] =

        if (stage.isShuffleMap) {

          closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()

        } else {

          closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()

        }

      taskBinary = sc.broadcast(taskBinaryBytes)

    } catch {

      // In the case of a failure during serialization, abort the stage.

      case e: NotSerializableException =>

        abortStage(stage, "Task not serializable: " + e.toString)

        runningStages -= stage

        return

      case NonFatal(e) =>

        abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")

        runningStages -= stage

        return

    }

//Create the tasks for the stage, one per partition to compute

    val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {

      partitionsToCompute.map { id =>

//Create one task for each partition

//and compute the preferred locations for each task

        val locs = getPreferredLocs(stage.rdd, id)

        val part = stage.rdd.partitions(id)

//For every stage except the final stage, isShuffleMap is true,

//so a ShuffleMapTask is created for each partition

        new ShuffleMapTask(stage.id, taskBinary, part, locs)

      }

    } else {

//If the stage is not a shuffle map stage, it is the final stage,

//and the final stage creates ResultTasks

      val job = stage.resultOfJob.get

      partitionsToCompute.map { id =>

        val p: Int = job.partitions(id)

        val part = stage.rdd.partitions(p)

//getPreferredLocs computes the preferred locations for this task

        val locs = getPreferredLocs(stage.rdd, p)

        new ResultTask(stage.id, taskBinary, part, locs, id)

      }

    }


    if (tasks.size > 0) {

      logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")

      stage.pendingTasks ++= tasks

      logDebug("New pending tasks: " + stage.pendingTasks)

      taskScheduler.submitTasks(

        new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))

      stage.latestInfo.submissionTime = Some(clock.getTimeMillis())

    } else {

      // Because we posted SparkListenerStageSubmitted earlier, we should post

      // SparkListenerStageCompleted here in case there are no tasks to run.

      outputCommitCoordinator.stageEnd(stage.id)

      listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))

      logDebug("Stage " + stage + " is actually done; %b %d %d".format(

        stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))

      runningStages -= stage

    }

  }
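
To make this concrete, here is a minimal sketch of a driver program (not part of the original post; the object name and settings are made up for illustration) whose execution goes through submitMissingTasks: reduceByKey introduces a shuffle, so the upstream stage is submitted as one ShuffleMapTask per partition and the final stage as one ResultTask per partition.

import org.apache.spark.{SparkConf, SparkContext}

object SubmitStageExample {  // hypothetical name, for illustration only
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("submit-stage-example").setMaster("local[2]"))

    // reduceByKey adds a ShuffleDependency, so the DAGScheduler splits this job into two stages:
    // the map-side stage gets one ShuffleMapTask per partition (2 here),
    // the final stage gets one ResultTask per partition of the shuffled RDD.
    val counts = sc.parallelize(Seq("a", "b", "a", "c"), 2)
      .map(word => (word, 1))
      .reduceByKey(_ + _)

    // collect() is the action that triggers the DAGScheduler, which then calls
    // submitMissingTasks for each stage whose parent stages are available.
    println(counts.collect().toList)

    sc.stop()
  }
}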




  def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = {

    getPreferredLocsInternal(rdd, partition, new HashSet)

  }

//The preferred location of a task is the preferred location of its partition:

//starting from the last RDD of the stage, look for an RDD that has been persisted or checkpointed;

//the task's preferred location is then the location of that cached/checkpointed partition,

//because running the task on that node avoids recomputing the earlier RDDs in the lineage.

  private def getPreferredLocsInternal(

      rdd: RDD[_],

      partition: Int,

      visited: HashSet[(RDD[_],Int)])

    : Seq[TaskLocation] =

  {

    // If the partition has already been visited, no need to re-visit.

    // This avoids exponential path exploration.  SPARK-695

    if (!visited.add((rdd,partition))) {

      // Nil has already been returned for previously visited partitions.

      return Nil

    }

    // If the partition is cached, return the cache locations

//Check whether this partition of the current RDD is cached

    val cached = getCacheLocs(rdd)(partition)

    if (!cached.isEmpty) {

      return cached

    }

    // If the RDD has some placement preferences (as is the case for input RDDs), get those

//Check whether the current RDD has placement preferences (e.g. it is checkpointed, or it is an input RDD reading from HDFS)

    val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList

    if (!rddPrefs.isEmpty) {

      return rddPrefs.map(TaskLocation(_))

    }

    // If the RDD has narrow dependencies, pick the first partition of the first narrow dep

    // that has any placement preferences. Ideally we would choose based on transfer sizes,

    // but this will do for now.

//Recurse through the narrow dependencies to see whether a parent RDD is cached or has placement preferences

    rdd.dependencies.foreach {

      case n: NarrowDependency[_] =>

        for (inPart <- n.getParents(partition)) {

          val locs = getPreferredLocsInternal(n.rdd, inPart, visited)

          if (locs != Nil) {

            return locs

          }

        }

      case _ =>

    }

//If no RDD in the lineage, from the last RDD back to the first, is cached or checkpointed,

//return Nil: the task has no preferred location and the TaskScheduler will decide where to run it

    Nil

  }
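
To illustrate the recursion above, here is a small self-contained sketch (a toy ToyRdd type invented for illustration, not Spark's classes, and simplified to a one-to-one parent-partition mapping instead of NarrowDependency.getParents): walk back from the last RDD, return the first cached location found, otherwise Nil so that the TaskScheduler chooses where to run the task.

import scala.collection.mutable

object PreferredLocsSketch {  // hypothetical sketch, not Spark source
  // A toy lineage node: which hosts cache each partition, and the parent RDDs.
  case class ToyRdd(id: Int, cachedHosts: Map[Int, Seq[String]], parents: Seq[ToyRdd])

  // Simplified analogue of getPreferredLocsInternal: assumes partition i of a child
  // maps to partition i of its parents (a narrow, one-to-one dependency).
  def preferredLocs(rdd: ToyRdd, partition: Int,
                    visited: mutable.HashSet[(Int, Int)] = mutable.HashSet.empty): Seq[String] = {
    if (!visited.add((rdd.id, partition))) return Nil  // already visited: avoid exponential exploration
    val cached = rdd.cachedHosts.getOrElse(partition, Nil)
    if (cached.nonEmpty) return cached                 // this partition is cached: prefer those hosts
    for (parent <- rdd.parents) {                      // otherwise look at the parents recursively
      val locs = preferredLocs(parent, partition, visited)
      if (locs.nonEmpty) return locs
    }
    Nil                                                // nothing cached in the lineage: the TaskScheduler decides
  }

  def main(args: Array[String]): Unit = {
    val parent = ToyRdd(1, Map(0 -> Seq("node-1")), Nil)   // partition 0 is cached on node-1
    val child  = ToyRdd(2, Map.empty, Seq(parent))         // the child RDD itself is not cached
    println(preferredLocs(child, 0))                       // List(node-1): run the task where the parent is cached
    println(preferredLocs(ToyRdd(3, Map.empty, Nil), 0))   // List(): no preference, the scheduler picks a location
  }
}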

