JuliaParallel · jpsamaroo · Dec 7, 2024 · Dec 11, 2024 · Sep 12, 2024 · Nov 14, 2024
diff --git a/Project.toml b/Project.toml
@@ -11,6 +11,7 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
+MetricsTracker = "9a9c6fec-044d-4a27-aa18-2b01ca4026eb"
 OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
@@ -33,6 +34,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 
@@ -41,6 +43,7 @@ DistributionsExt = "Distributions"
 GraphVizExt = "GraphViz"
 GraphVizSimpleExt = "Colors"
 JSON3Ext = "JSON3"
+JuMPExt = "JuMP"
 PlotsExt = ["DataFrames", "Plots"]
 PythonExt = "PythonCall"
 
@@ -54,6 +57,7 @@ Distributions = "0.25"
 GraphViz = "0.2"
 Graphs = "1"
 JSON3 = "1"
+JuMP = "1"
 MacroTools = "0.5"
 MemPool = "0.4.11"
 OnlineStats = "1"

diff --git a/ext/JuMPExt.jl b/ext/JuMPExt.jl
@@ -0,0 +1,163 @@
+module JuMPExt
+
+if isdefined(Base, :get_extension)
+    using JuMP
+else
+    using ..JuMP
+end
+
+using Dagger
+using Dagger.Distributed
+import MetricsTracker as MT
+import Graphs: edges, nv, outdegree
+
+struct JuMPScheduler  # Datadeps scheduler that chooses task placement by solving a JuMP model
+    optimizer  # handed to `Model(...)` whenever a schedule is created
+    Z::Float64  # weight applied to the DAG end time (`t_last_end`) in the objective
+    JuMPScheduler(optimizer, Z::Real=10) = new(optimizer, Z)  # default keeps the former fixed weight of 10
+end
+# Create a datadeps schedule by solving a mixed-integer program with `sched.optimizer`.
+# Binary `s[k, p]` places task `k` on processor `p`, `t[k]` is the start time of task
+# `k`, and every edge of `state.alias_state.g` forces its target task to start after
+# its source task ends (plus a transfer penalty if the two run on different
+# processors). Returns a `Dict{DTask, Dagger.Processor}` with one entry per task id.
+function Dagger.datadeps_create_schedule(sched::JuMPScheduler, state, specs_tasks)
+    g, task_to_id = state.alias_state.g, state.alias_state.task_to_id
+    id_to_task = Dict(id => task for (task, id) in task_to_id)
+    ntasks = length(specs_tasks)
+    nprocs = length(state.all_procs)
+
+    # Estimate the time each task will take to execute on each processor,
+    # and the time it will take to transfer data between processors
+    task_times = zeros(UInt64, ntasks, nprocs)
+    xfer_times = zeros(Int, nprocs, nprocs)
+    lock(MT.GLOBAL_METRICS_CACHE) do cache
+        for (spec, task) in specs_tasks
+            id = task_to_id[task]
+            # The signature depends only on the task, so build it once per task
+            sig = Dagger.Sch.signature(spec.f, map(pos_arg->pos_arg[1] => Dagger.unwrap_inout_value(pos_arg[2]), spec.args))
+            for p in 1:nprocs
+                # When searching for a task runtime estimate, we use whatever
+                # estimate is available and closest if not populated for this processor
+                # Exact match > same proc type, same node > same proc type, any node > any proc type
+
+                proc = state.all_procs[p]
+                @warn "Use node, not worker id!" maxlog=1
+                pid = Dagger.root_worker_id(proc)
+
+                # Try exact match
+                match_on = (MT.LookupExact(Dagger.SignatureMetric(), sig),
+                            MT.LookupExact(Dagger.ProcessorMetric(), proc))
+                result = MT.cache_lookup(cache, Dagger, :execute!, MT.TimeMetric(), match_on)::Union{UInt64, Nothing}
+                if result !== nothing
+                    task_times[id, p] = result
+                    continue
+                end
+
+                # Try same proc type, same node
+                match_on = (MT.LookupExact(Dagger.SignatureMetric(), sig),
+                            MT.LookupSubtype(Dagger.ProcessorMetric(), typeof(proc)),
+                            MT.LookupCustom(Dagger.ProcessorMetric(), other_proc->Dagger.root_worker_id(other_proc)==pid))
+                result = MT.cache_lookup(cache, Dagger, :execute!, MT.TimeMetric(), match_on)::Union{UInt64, Nothing}
+                if result !== nothing
+                    task_times[id, p] = result
+                    continue
+                end
+
+                # Try same proc type, any node
+                match_on = (MT.LookupExact(Dagger.SignatureMetric(), sig),
+                            MT.LookupSubtype(Dagger.ProcessorMetric(), typeof(proc)))
+                result = MT.cache_lookup(cache, Dagger, :execute!, MT.TimeMetric(), match_on)::Union{UInt64, Nothing}
+                if result !== nothing
+                    task_times[id, p] = result
+                    continue
+                end
+
+                # Try any signature match
+                match_on = MT.LookupExact(Dagger.SignatureMetric(), sig)
+                result = MT.cache_lookup(cache, Dagger, :execute!, MT.TimeMetric(), match_on)::Union{UInt64, Nothing}
+                if result !== nothing
+                    task_times[id, p] = result
+                    continue
+                end
+
+                # If no information is available, use a random guess
+                task_times[id, p] = UInt64(rand(1:1_000_000))
+            end
+        end
+
+        # FIXME: Actually fill this with estimated xfer times
+        @warn "Assuming all xfer times are 1" maxlog=1
+        for dst in 1:nprocs, src in 1:nprocs
+            # FIXME: Should also be 0 if space is shared; otherwise should be
+            # sum(currently non-local task arg size) / xfer_speed
+            xfer_times[src, dst] = src == dst ? 0 : 1
+        end
+    end
+
+    @warn "If no edges exist, this will fail" maxlog=1
+    # Dependency edges (k, l) of the task graph, each with its own transfer-cost matrix
+    a_kls = Tuple.(edges(g))
+    γ = Dict{Tuple{Int, Int}, Matrix{Int}}(kl => copy(xfer_times) for kl in a_kls)
+    m = Model(sched.optimizer)
+    JuMP.set_silent(m)
+
+    # Start time of each task
+    @variable(m, t[1:ntasks] >= 0)
+    # End time of last task
+    @variable(m, t_last_end >= 0)
+
+    # 1 if task k is assigned to proc p
+    @variable(m, s[1:ntasks, 1:nprocs], Bin)
+
+    # Each task is assigned to exactly one processor
+    @constraint(m, [k in 1:ntasks], sum(s[k, :]) == 1)
+
+    # Penalties for moving between procs
+    if length(a_kls) > 0
+        @variable(m, p[a_kls] >= 0)
+
+        for (k, l) in a_kls
+            for p1 in 1:nprocs
+                for p2 in 1:nprocs
+                    p1 == p2 && continue
+                    # Task l occurs after task k if the procs are different,
+                    # thus there is a penalty
+                    @constraint(m, p[(k, l)] >= (s[k, p1] + s[l, p2] - 1) * γ[(k, l)][p1, p2])
+                end
+            end
+
+            # Task l occurs after task k
+            @constraint(m, t[k] + task_times[k, :]' * s[k, :] + p[(k, l)] <= t[l])
+        end
+    else
+        @variable(m, p >= 0)
+    end
+
+    for l in filter(n -> outdegree(g, n) == 0, 1:nv(g))
+        # DAG ends after the last task
+        @constraint(m, t[l] + task_times[l, :]' * s[l, :] <= t_last_end)
+    end
+
+    # Minimize the total runtime of the DAG
+    # TODO: Do we need to bias towards earlier start times?
+    @objective(m, Min, sched.Z*t_last_end + sum(t) + sum(p))
+
+    # Solve the model
+    optimize!(m)
+    JuMP.has_values(m) ||
+        error("JuMPScheduler: optimizer returned no solution (status: $(termination_status(m)))")
+
+    # Extract the schedule from the model
+    task_to_proc = Dict{DTask, Dagger.Processor}()
+    for k in 1:ntasks
+        # Solvers report binaries only up to a tolerance (e.g. 0.9999999), so take the
+        # largest entry of the row instead of testing for exact equality with 1
+        proc_id = argmax(value.(s[k, :]))
+        task_to_proc[id_to_task[k]] = state.all_procs[proc_id]
+    end
+
+    return task_to_proc
+end
+
+end # module JuMPExt
diff --git a/lib/MetricsTracker/LICENSE.md b/lib/MetricsTracker/LICENSE.md
@@ -0,0 +1,22 @@
+MetricsTracker.jl is licensed under the MIT "Expat" License:
+
+> Copyright (c) 2024: Julian P Samaroo and contributors
+>
+> Permission is hereby granted, free of charge, to any person obtaining
+> a copy of this software and associated documentation files (the
+> "Software"), to deal in the Software without restriction, including
+> without limitation the rights to use, copy, modify, merge, publish,
+> distribute, sublicense, and/or sell copies of the Software, and to
+> permit persons to whom the Software is furnished to do so, subject to
+> the following conditions:
+>
+> The above copyright notice and this permission notice shall be
+> included in all copies or substantial portions of the Software.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/MetricsTracker/Project.toml b/lib/MetricsTracker/Project.toml
@@ -0,0 +1,16 @@
+name = "MetricsTracker"
+uuid = "9a9c6fec-044d-4a27-aa18-2b01ca4026eb"
+authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>"]
+version = "0.1.0"
+
+[deps]
+MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
+Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34"
+
+[compat]
+MacroTools = "0.5.13"
+ScopedValues = "1.2.1"
+Serialization = "1.11.0"
+TaskLocalValues = "0.1.1"
diff --git a/lib/MetricsTracker/src/MetricsTracker.jl b/lib/MetricsTracker/src/MetricsTracker.jl
@@ -0,0 +1,17 @@
+module MetricsTracker
+
+import MacroTools: @capture
+import ScopedValues: ScopedValue, @with
+import TaskLocalValues: TaskLocalValue
+
+include("types.jl")
+include("metrics.jl")
+include("lookup.jl")
+include("io.jl")
+include("builtins.jl")
+# FIXME: the analysis, aggregation and decision files below are not loaded yet
+#include("analysis.jl")
+#include("aggregate.jl")
+#include("decision.jl")
+
+end # module MetricsTracker
diff --git a/lib/MetricsTracker/src/aggregate.jl b/lib/MetricsTracker/src/aggregate.jl
@@ -0,0 +1,17 @@
+abstract type AbstractAggregator <: AbstractAnalysis end  # supertype of aggregators such as `SimpleAverageAggregator`
+
+#### Built-in Aggregators ####
+
+struct SimpleAverageAggregator{T} <: AbstractAggregator  # halves the sum of two fetched values of `inner`
+    inner::T  # metric or analysis whose value gets averaged
+end
+required_metrics(agg::SimpleAverageAggregator, ::Val{context}, ::Val{op}) where {context,op} =
+    RequiredMetrics((context, op) => [agg.inner])  # only `inner` is required, for this (context, op)
+function run_analysis(agg::SimpleAverageAggregator, ::Val{context}, ::Val{op}, @nospecialize(args...)) where {context,op}
+    # NOTE(review): `fetch_metric_cached` is not defined in the code visible here; presumably it yields the previous result
+    previous = fetch_metric_cached(agg, context, op, args...)
+    latest = fetch_metric(agg.inner, context, op, args...)
+    # Nothing to average with: pass `latest` through (it may itself be `nothing`)
+    (previous === nothing || latest === nothing) && return latest
+    return div(previous + latest, 2)
+end
diff --git a/lib/MetricsTracker/src/analysis.jl b/lib/MetricsTracker/src/analysis.jl
@@ -0,0 +1,46 @@
+const RequiredMetrics = Dict{Tuple{Module,Symbol},Vector{AnalysisOrMetric}}  # (module, symbol) key => required metrics/analyses
+const RequiredMetricsAny = Vector{AnalysisOrMetric}  # plain list form; presumably not tied to one (module, symbol) key
+const NO_REQUIRED_METRICS = RequiredMetrics()  # single shared empty instance; callers should not mutate it
+required_metrics(::AnalysisOrMetric, _, _) = NO_REQUIRED_METRICS  # fallback: nothing is required
+
+function fetch_metric(m::AnalysisOrMetric, mod::Module, context::Symbol, key, extra; cached=false)
+    # Return the value of `m` for `key` in `(mod, context)`: metrics are read from the
+    # local cache (`nothing` on a miss), analyses are run and their result is stored.
+    @assert !COLLECTING_METRICS[] "Nesting analysis and metrics collection not yet supported"
+    cache = local_metrics_cache(mod, context, key)
+    # With `cached=true` the stored entry is returned as-is, without running anything
+    cached && return cache[m]
+    # FIXME: Proper invalidation support
+    if m isa AbstractMetric
+        if !haskey(cache, m)
+            # The metric isn't available yet
+            @debug "-- MISS for ($mod, $context) $m [$key]"
+            return nothing
+        end
+        value = cache[m]
+        @debug "-- HIT for ($mod, $context) $m [$key] = $value"
+        return value
+    end
+    m isa AbstractAnalysis || return nothing  # neither a metric nor an analysis
+    # Run the analysis
+    @debug "Running ($mod, $context) $m [$key]"
+    value = run_analysis(m, Val{nameof(mod)}(), Val{context}(), key, extra)
+    # TODO: Allocate the correct Dict type
+    # NOTE(review): stored as `cache[m][key]`, while the reads above use `cache[m]` directly
+    get!(Dict, cache, m)[key] = value
+    @debug "Finished ($mod, $context) $m [$key] = $value"
+    return value
+end
+
+#### Built-in Analyses ####
+
+# Analysis: fetched `TimeMetric` value minus the first entry of the fetched `CompileTimeMetric` value
+struct RuntimeWithoutCompilation <: AbstractAnalysis end
+required_metrics(::RuntimeWithoutCompilation) =
+    RequiredMetricsAny([TimeMetric(), CompileTimeMetric()])
+metric_type(::RuntimeWithoutCompilation) = UInt64
+function run_analysis(::RuntimeWithoutCompilation, mod, context, key, extra)
+    elapsed = fetch_metric(TimeMetric(), mod, context, key, extra)
+    compile = fetch_metric(CompileTimeMetric(), mod, context, key, extra)  # NOTE(review): a fetch may be `nothing` on a miss
+    return elapsed - first(compile)  # subtract the first entry of the compile-time value
+end
diff --git a/lib/MetricsTracker/src/builtins.jl b/lib/MetricsTracker/src/builtins.jl
@@ -0,0 +1,48 @@
+#### Built-in Metrics ####
+
+struct TimeMetric <: AbstractMetric end  # difference of two `time_ns()` readings
+metric_applies(::TimeMetric, ::Any) = true
+metric_type(::TimeMetric) = UInt64
+start_metric(::TimeMetric) = time_ns()  # starting reading; presumably passed back to `stop_metric`
+stop_metric(::TimeMetric, started::UInt64) = time_ns() - started
+
+struct ThreadTimeMetric <: AbstractMetric end  # difference of two `cputhreadtime()` readings
+metric_applies(::ThreadTimeMetric, ::Any) = true
+metric_type(::ThreadTimeMetric) = UInt64
+start_metric(::ThreadTimeMetric) = cputhreadtime()  # `cputhreadtime` is defined outside this file
+stop_metric(::ThreadTimeMetric, started::UInt64) = cputhreadtime() - started
+
+struct CompileTimeMetric <: AbstractMetric end  # change in `Base.cumulative_compile_time_ns()` between start and stop
+metric_applies(::CompileTimeMetric, ::Any) = true
+metric_type(::CompileTimeMetric) = Tuple{UInt64, UInt64}
+function start_metric(::CompileTimeMetric)
+    Base.cumulative_compile_timing(true)  # switch compile-time accounting on
+    return Base.cumulative_compile_time_ns()  # baseline counters
+end
+function stop_metric(::CompileTimeMetric, baseline::Tuple{UInt64, UInt64})
+    Base.cumulative_compile_timing(false)
+    return map(-, Base.cumulative_compile_time_ns(), baseline)  # element-wise delta
+end
+
+struct AllocMetric <: AbstractMetric end  # `Base.GC_Diff` between two `Base.gc_num()` snapshots
+metric_applies(::AllocMetric, ::Any) = true
+metric_type(::AllocMetric) = Base.GC_Diff
+start_metric(::AllocMetric) = Base.gc_num()  # snapshot taken at start
+stop_metric(::AllocMetric, before::Base.GC_Num) = Base.GC_Diff(Base.gc_num(), before)
+
+struct ResultShapeMetric <: AbstractMetric end  # `size` of an array result, `nothing` otherwise
+metric_applies(::ResultShapeMetric, ::Any) = true
+metric_type(::ResultShapeMetric) = Union{Dims, Nothing}
+is_result_metric(::ResultShapeMetric) = true  # no start/stop methods are defined for it here
+result_metric(::ResultShapeMetric, result) =
+    result isa AbstractArray ? size(result) : nothing
+
+struct LoadAverageMetric <: AbstractMetric end  # `Sys.loadavg()` values divided by `Sys.CPU_THREADS`
+metric_applies(::LoadAverageMetric, ::Any) = true
+metric_type(::LoadAverageMetric) = Tuple{Float64, Float64, Float64}
+start_metric(::LoadAverageMetric) = nothing  # nothing is recorded up front
+stop_metric(::LoadAverageMetric, ::Any) = Tuple(Sys.loadavg()) ./ Sys.CPU_THREADS  # sampled at stop time
+
+# TODO: Useful metrics to add
+# perf performance counters
+# BPF probe-collected metrics