    /*
     * Preferred average time interval between consecutive invocations of
     * the driver to set the frequency for this policy. To be set by the
     * scaling driver (0, which is the default, means no preference).
     */
    unsigned int        transition_delay_us;

    /*
     * Remote DVFS flag (Not added to the driver structure as we don't want
     * to access another structure from scheduler hotpath).
     *
     * Should be set if CPUs can do DVFS on behalf of other CPUs from
     * different cpufreq policies.
     */
    bool            dvfs_possible_from_any_cpu;

    /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
    unsigned int        cached_target_freq;
    int             cached_resolved_idx;

    /* cpufreq-stats */
    struct cpufreq_stats    *stats;

    /* For cpufreq driver's internal use */
    void            *driver_data;
cpufreq can be configured in Kconfig (CPU Power Management -> CPU Frequency scaling): which governors are built, which governor is the system default, and which cpufreq scaling driver is used. For example, the Phytium E2000 5.10 kernel is configured as follows. It defaults to the schedutil governor, which adjusts voltage/frequency based on the CPU utilization information provided by the scheduler; EAS (energy-aware scheduling) depends on this governor:
Suppose a CPU device supports the following voltage/frequency combinations:
{300 MHz at a minimum voltage of 1 V}
{800 MHz at a minimum voltage of 1.2 V}
{1 GHz at a minimum voltage of 1.3 V}
Expressed as OPPs in {Hz, uV} form, these become:
{300000000, 1000000}
{800000000, 1200000}
{1000000000, 1300000}
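For reference, the same table could be registered from driver code with the generic OPP API. A minimal sketch, assuming a hypothetical register_example_opps() helper and a valid cpu_dev (DT- or SCMI-based drivers normally do not add OPPs by hand):

#include <linux/pm_opp.h>

/* Hypothetical helper: register the three example OPPs for cpu_dev. */
static int register_example_opps(struct device *cpu_dev)
{
    int ret;

    ret = dev_pm_opp_add(cpu_dev, 300000000, 1000000);  /* 300 MHz @ 1.0 V */
    if (ret)
        return ret;

    ret = dev_pm_opp_add(cpu_dev, 800000000, 1200000);  /* 800 MHz @ 1.2 V */
    if (ret)
        return ret;

    return dev_pm_opp_add(cpu_dev, 1000000000, 1300000); /* 1 GHz @ 1.3 V */
}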
// include/linux/scmi_protocol.h
/**
 * struct scmi_perf_ops - represents the various operations provided
 *	by SCMI Performance Protocol
 *
 * @limits_set: sets limits on the performance level of a domain
 * @limits_get: gets limits on the performance level of a domain
 * @level_set: sets the performance level of a domain
 * @level_get: gets the performance level of a domain
 * @device_domain_id: gets the scmi domain id for a given device
 * @transition_latency_get: gets the DVFS transition latency for a given device
 * @device_opps_add: adds all the OPPs for a given device
 * @freq_set: sets the frequency for a given device using sustained frequency
 *	to sustained performance level mapping
 * @freq_get: gets the frequency for a given device using sustained frequency
 *	to sustained performance level mapping
 * @est_power_get: gets the estimated power cost for a given performance domain
 *	at a given frequency
 */
struct scmi_perf_ops {
    int (*limits_set)(const struct scmi_handle *handle, u32 domain,
                      u32 max_perf, u32 min_perf);
    int (*limits_get)(const struct scmi_handle *handle, u32 domain,
                      u32 *max_perf, u32 *min_perf);
    int (*level_set)(const struct scmi_handle *handle, u32 domain,
                     u32 level, bool poll);
    int (*level_get)(const struct scmi_handle *handle, u32 domain,
                     u32 *level, bool poll);
    int (*device_domain_id)(struct device *dev);
    int (*transition_latency_get)(const struct scmi_handle *handle,
                                  struct device *dev);
    int (*device_opps_add)(const struct scmi_handle *handle,
                           struct device *dev);
    int (*freq_set)(const struct scmi_handle *handle, u32 domain,
                    unsigned long rate, bool poll);
    int (*freq_get)(const struct scmi_handle *handle, u32 domain,
                    unsigned long *rate, bool poll);
    int (*est_power_get)(const struct scmi_handle *handle, u32 domain,
                         unsigned long *rate, unsigned long *power);
    bool (*fast_switch_possible)(const struct scmi_handle *handle,
                                 struct device *dev);
};
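As a rough illustration of how a cpufreq driver consumes these ops, here is a simplified sketch (not the actual drivers/cpufreq/scmi-cpufreq.c code; the handle pointer and the scmi_data/domain_id fields are assumed to be set up in the driver's probe/init path):

struct scmi_data {
    int domain_id;          /* SCMI performance domain of this policy */
    struct device *cpu_dev;
};

static const struct scmi_handle *handle;    /* assumed: set at probe time */

/* Sketch of a ->target_index()-style callback built on perf_ops */
static int scmi_cpufreq_set_target_sketch(struct cpufreq_policy *policy,
                                          unsigned int index)
{
    struct scmi_data *priv = policy->driver_data;
    u64 freq = policy->freq_table[index].frequency;    /* kHz */

    /* freq_set() takes Hz; poll=false lets the firmware complete asynchronously */
    return handle->perf_ops->freq_set(handle, priv->domain_id,
                                      freq * 1000, false);
}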
// drivers/opp/core.c
/*
 * The root of the list of all opp-tables. All opp_table structures branch off
 * from here, with each opp_table containing the list of opps it supports in
 * various states of availability.
 */
// opp_tables is the head of the global list of opp_table structures
LIST_HEAD(opp_tables);
    /* Check if this CPU already has a policy to manage it */
    // cpufreq_cpu_data is a per-CPU variable holding a pointer to the policy;
    // fetch the policy pointer for this CPU here
    policy = per_cpu(cpufreq_cpu_data, cpu);
    if (policy) {
        // It would be a bug if this CPU were not in the policy's related_cpus
        WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus));
        // Check whether this policy still has any online CPU
        if (!policy_is_inactive(policy))
            // Add the current CPU to the policy's online CPUs (policy->cpus)
            return cpufreq_add_policy_cpu(policy, cpu);

        /* This is the only online CPU for the policy.  Start over. */
        new_policy = false;
        down_write(&policy->rwsem);
        policy->cpu = cpu;
        policy->governor = NULL;
        up_write(&policy->rwsem);
    } else {
        // On first boot the policy structure has to be allocated
        new_policy = true;
        policy = cpufreq_policy_alloc(cpu);
        if (!policy)
            return -ENOMEM;
    }

    // The SCMI cpufreq driver does not implement the online callback
    if (!new_policy && cpufreq_driver->online) {
        ret = cpufreq_driver->online(policy);
        if (ret) {
            pr_debug("%s: %d: initialization failed\n", __func__,
                     __LINE__);
            goto out_exit_policy;
        }

        /*
         * Call driver. From then on the cpufreq must be able
         * to accept all calls to ->verify and ->setpolicy for this CPU.
         */
        // cpufreq_driver is set in cpufreq_register_driver();
        // initialize the scaling driver for this policy
        ret = cpufreq_driver->init(policy);
        if (ret) {
            pr_debug("%s: %d: initialization failed\n", __func__,
                     __LINE__);
            goto out_free_policy;
        }
        ret = cpufreq_table_validate_and_sort(policy);
        if (ret)
            goto out_exit_policy;

        /* related_cpus should at least include policy->cpus. */
        cpumask_copy(policy->related_cpus, policy->cpus);
    }

    down_write(&policy->rwsem);
    /*
     * affected cpus must always be the one, which are online. We aren't
     * managing offline cpus here.
     */
    cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
        policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req),
                                       GFP_KERNEL);
        if (!policy->min_freq_req)
            goto out_destroy_policy;

        ret = freq_qos_add_request(&policy->constraints,
                                   policy->min_freq_req, FREQ_QOS_MIN,
                                   policy->min);
        if (ret < 0) {
            /*
             * So we don't call freq_qos_remove_request() for an
             * uninitialized request.
             */
            kfree(policy->min_freq_req);
            policy->min_freq_req = NULL;
            goto out_destroy_policy;
        }

        /*
         * This must be initialized right here to avoid calling
         * freq_qos_remove_request() on uninitialized request in case
         * of errors.
         */
        policy->max_freq_req = policy->min_freq_req + 1;

        ret = freq_qos_add_request(&policy->constraints,
                                   policy->max_freq_req, FREQ_QOS_MAX,
                                   policy->max);
        if (ret < 0) {
            policy->max_freq_req = NULL;
            goto out_destroy_policy;
        }

    if (cpufreq_driver->get && has_target()) {
        policy->cur = cpufreq_driver->get(policy->cpu);
        if (!policy->cur) {
            pr_err("%s: ->get() failed\n", __func__);
            goto out_destroy_policy;
        }
    }
    /*
     * Sometimes boot loaders set CPU frequency to a value outside of
     * frequency table present with cpufreq core. In such cases CPU might be
     * unstable if it has to run on that frequency for long duration of time
     * and so its better to set it to a frequency which is specified in
     * freq-table. This also makes cpufreq stats inconsistent as
     * cpufreq-stats would fail to register because current frequency of CPU
     * isn't found in freq-table.
     *
     * Because we don't want this change to effect boot process badly, we go
     * for the next freq which is >= policy->cur ('cur' must be set by now,
     * otherwise we will end up setting freq to lowest of the table as 'cur'
     * is initialized to zero).
     *
     * We are passing target-freq as "policy->cur - 1" otherwise
     * __cpufreq_driver_target() would simply fail, as policy->cur will be
     * equal to target-freq.
     */
    if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK)
        && has_target()) {
        unsigned int old_freq = policy->cur;

        /* Are we running at unknown frequency ? */
        ret = cpufreq_frequency_table_get_index(policy, old_freq);
        if (ret == -EINVAL) {
            ret = __cpufreq_driver_target(policy, old_freq - 1,
                                          CPUFREQ_RELATION_L);

            /*
             * Reaching here after boot in a few seconds may not
             * mean that system will remain stable at "unknown"
             * frequency for longer duration. Hence, a BUG_ON().
             */
            BUG_ON(ret);
            pr_info("%s: CPU%d: Running at unlisted initial frequency: %u KHz, changing to: %u KHz\n",
                    __func__, policy->cpu, old_freq, policy->cur);
        }
    }

    if (new_policy) {
        ret = cpufreq_add_dev_interface(policy);
        if (ret)
            goto out_destroy_policy;
    // Get the CPU's struct device, i.e. the corresponding per-CPU variable
    // cpu_sys_devices
    cpu_dev = get_cpu_device(policy->cpu);
    if (!cpu_dev) {
        pr_err("failed to get cpu%d device\n", policy->cpu);
        return -ENODEV;
    }

    // Call scmi_dvfs_device_opps_add() to build the OPP table
    ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
    if (ret) {
        dev_warn(cpu_dev, "failed to add opps to the device\n");
        return ret;
    }

    // Walk the available CPUs and compare their domain_id with cpu_dev's;
    // CPUs in the same domain are added to policy->cpus
    ret = scmi_get_sharing_cpus(cpu_dev, policy->cpus);
    if (ret) {
        dev_warn(cpu_dev, "failed to get sharing cpumask\n");
        return ret;
    }

    // Mark the OPP table as shared by all CPUs in policy->cpus
    ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
    if (ret) {
        dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
                __func__, ret);
        return ret;
    }

    nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
    if (nr_opp <= 0) {
        dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
        ret = -EPROBE_DEFER;
        goto out_free_opp;
    }

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);
    if (!priv) {
        ret = -ENOMEM;
        goto out_free_opp;
    }

    // Build the cpufreq frequency table from the OPP table
    ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
    if (ret) {
        dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
        goto out_free_priv;
    }
/* Common Governor data across policies */
// The abstracted ondemand governor structure
// drivers/cpufreq/cpufreq_governor.h
struct dbs_governor {
    struct cpufreq_governor gov;
    struct kobj_type kobj_type;

    /*
     * Common data for platforms that don't set
     * CPUFREQ_HAVE_GOVERNOR_PER_POLICY
     */
    struct dbs_data *gdbs_data;

/* Governor demand based switching data (per-policy or global). */
// Parameters ondemand uses to compute the frequency: thresholds, sampling
// rate, etc.  The default up_threshold is a load of 80%.
// dbs = demand based switching
struct dbs_data {
    struct gov_attr_set attr_set;
    void *tuners;
    unsigned int ignore_nice_load;
    unsigned int sampling_rate;
    unsigned int sampling_down_factor;
    unsigned int up_threshold;
    unsigned int io_is_busy;
};

/* Common to all CPUs of a policy */
// drivers/cpufreq/cpufreq_governor.h
struct policy_dbs_info {
    struct cpufreq_policy *policy;
    /*
     * Per policy mutex that serializes load evaluation from limit-change
     * and work-handler.
     */
    struct mutex update_mutex;

    u64 last_sample_time;
    s64 sample_delay_ns;
    atomic_t work_count;
    struct irq_work irq_work;
    struct work_struct work;
    /* dbs_data may be shared between multiple policy objects */
    struct dbs_data *dbs_data;
    struct list_head list;
    /* Multiplier for increasing sample delay temporarily. */
    unsigned int rate_mult;
    unsigned int idle_periods;      /* For conservative */
    /* Status indicators */
    bool is_shared;         /* This object is used by multiple CPUs */
    bool work_in_progress;  /* Work is being queued up or in progress */
};
/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
 * @rq: Runqueue to carry out the update for.
 * @flags: Update reason flags.
 *
 * This function is called by the scheduler on the CPU whose utilization is
 * being updated.
 *
 * It can only be called from RCU-sched read-side critical sections.
 *
 * The way cpufreq is currently arranged requires it to evaluate the CPU
 * performance state (frequency/voltage) on a regular basis to prevent it from
 * being stuck in a completely inadequate performance level for too long.
 * That is not guaranteed to happen if the updates are only triggered from CFS
 * and DL, though, because they may not be coming in if only RT tasks are
 * active all the time (or there are RT tasks only).
 *
 * As a workaround for that issue, this function is called periodically by the
 * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
 * but that really is a band-aid. Going forward it should be replaced with
 * solutions targeted more specifically at RT tasks.
 */
// kernel/sched/sched.h
// When cpufreq_update_util() is called it runs the callback registered
// earlier, update_util_data->func.
// To account for RT tasks, the RT scheduling class currently calls this
// function periodically so that frequency updates do not lag behind.
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
    struct update_util_data *data;

    data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
                                              cpu_of(rq)));
    if (data)
        data->func(data, rq_clock(rq), flags);
}
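The other half of this mechanism is the registration side: a governor installs its per-CPU callback with cpufreq_add_update_util_hook() (kernel/sched/cpufreq.c), and that callback is the func pointer invoked above. A minimal sketch, with the my_gov_* names made up for illustration:

static DEFINE_PER_CPU(struct update_util_data, my_gov_update_data);

/* Runs in the scheduler hot path: only cheap checks or queueing irq_work here */
static void my_gov_update_hook(struct update_util_data *data, u64 time,
                               unsigned int flags)
{
    /* evaluate utilization / decide whether a frequency change is needed */
}

static void my_gov_start_sketch(struct cpufreq_policy *policy)
{
    unsigned int cpu;

    for_each_cpu(cpu, policy->cpus)
        cpufreq_add_update_util_hook(cpu,
                                     per_cpu_ptr(&my_gov_update_data, cpu),
                                     my_gov_update_hook);
}

When the governor is stopped, the hook is removed again with cpufreq_remove_update_util_hook() for each CPU of the policy.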
    // Check whether the CPU running this update is allowed to change the
    // frequency of the CPUs covered by this policy (remote DVFS); the
    // Phytium E2000 supports this
    if (!cpufreq_this_cpu_can_update(policy_dbs->policy))
        return;

    /*
     * The work may not be allowed to be queued up right now.
     * Possible reasons:
     * - Work has already been queued up or is in progress.
     * - It is too early (too little time from the previous sample).
     */
    if (policy_dbs->work_in_progress)
        return;

    /*
     * If the reads below are reordered before the check above, the value
     * of sample_delay_ns used in the computation may be stale.
     */
    // Check the time since the last sample; if it is smaller than
    // sample_delay_ns, return without doing anything
    smp_rmb();
    lst = READ_ONCE(policy_dbs->last_sample_time);
    delta_ns = time - lst;
    if ((s64)delta_ns < policy_dbs->sample_delay_ns)
        return;

    /*
     * If the policy is not shared, the irq_work may be queued up right away
     * at this point. Otherwise, we need to ensure that only one of the
     * CPUs sharing the policy will do that.
     */
    // If the policy is shared by several CPUs, make sure another CPU has not
    // already handled this sample
    if (policy_dbs->is_shared) {
        if (!atomic_add_unless(&policy_dbs->work_count, 1, 1))
            return;

        /*
         * If another CPU updated last_sample_time in the meantime, we
         * shouldn't be here, so clear the work counter and bail out.
         */
        if (unlikely(lst != READ_ONCE(policy_dbs->last_sample_time))) {
            atomic_set(&policy_dbs->work_count, 0);
            return;
        }
    }

    /*
     * Make sure cpufreq_governor_limits() isn't evaluating load or the
     * ondemand governor isn't updating the sampling rate in parallel.
     */
    // Call the governor's gov_dbs_update() hook
    mutex_lock(&policy_dbs->update_mutex);
    gov_update_sample_delay(policy_dbs, gov->gov_dbs_update(policy));
    mutex_unlock(&policy_dbs->update_mutex);

    /* Allow the utilization update handler to queue up more work. */
    atomic_set(&policy_dbs->work_count, 0);
    /*
     * If the update below is reordered with respect to the sample delay
     * modification, the utilization update handler may end up using a stale
     * sample delay value.
     */
    smp_wmb();
    policy_dbs->work_in_progress = false;
}
    /* Common NORMAL_SAMPLE setup */
    dbs_info->sample_type = OD_NORMAL_SAMPLE;

    /*
     * OD_SUB_SAMPLE doesn't make sense if sample_delay_ns is 0, so ignore
     * it then.
     */
    if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) {
        __cpufreq_driver_target(policy, dbs_info->freq_lo,
                                CPUFREQ_RELATION_H);
        return dbs_info->freq_lo_delay_us;
    }
/*
 * Every sampling_rate, we check, if current idle time is less than 20%
 * (default), then we try to increase frequency. Else, we adjust the frequency
 * proportional to load.
 */
// drivers/cpufreq/cpufreq_ondemand.c
// Adjust the frequency according to the CPU load
static void od_update(struct cpufreq_policy *policy)
{
    struct policy_dbs_info *policy_dbs = policy->governor_data;
    struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
    struct dbs_data *dbs_data = policy_dbs->dbs_data;
    struct od_dbs_tuners *od_tuners = dbs_data->tuners;
    unsigned int load = dbs_update(policy);

    dbs_info->freq_lo = 0;

    /* Check for frequency increase */
    // If the current CPU load is above up_threshold (80% by default, tunable
    // via sysfs), jump straight to the maximum frequency
    if (load > dbs_data->up_threshold) {
        /* If switching to max speed, apply sampling_down_factor */
        if (policy->cur < policy->max)
            policy_dbs->rate_mult = dbs_data->sampling_down_factor;
        dbs_freq_increase(policy, policy->max);
    } else {
        /* Calculate the next frequency proportional to load */
        unsigned int freq_next, min_f, max_f;
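The else branch (not shown above) then scales the frequency linearly with load between cpuinfo.min_freq and cpuinfo.max_freq, roughly as follows (a simplified sketch of the upstream code, with powersave_bias handling omitted):

        min_f = policy->cpuinfo.min_freq;
        max_f = policy->cpuinfo.max_freq;
        freq_next = min_f + load * (max_f - min_f) / 100;

        /* Not ramping to the maximum, so reset the sampling multiplier */
        policy_dbs->rate_mult = 1;

        __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C);
    }

For example, a load of 50 on a CPU with a 300 MHz..1 GHz range asks for 300000 + 50 * (1000000 - 300000) / 100 = 650000 kHz.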
    /*
     * Sometimes governors may use an additional multiplier to increase
     * sample delays temporarily. Apply that multiplier to sampling_rate
     * so as to keep the wake-up-from-idle detection logic a bit
     * conservative.
     */
    sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult;
    /*
     * For the purpose of ondemand, waiting for disk IO is an indication
     * that you're performance critical, and not that the system is actually
     * idle, so do not add the iowait time to the CPU idle time then.
     */
    io_busy = dbs_data->io_is_busy;
        // Handle a few rare corner cases first
        if (unlikely(!time_elapsed)) {
            /*
             * That can only happen when this function is called
             * twice in a row with a very short interval between the
             * calls, so the previous load value can be used then.
             */
            load = j_cdbs->prev_load;
        } else if (unlikely((int)idle_time > 2 * sampling_rate &&
                            j_cdbs->prev_load)) {
            /*
             * If the CPU had gone completely idle and a task has
             * just woken up on this CPU now, it would be unfair to
             * calculate 'load' the usual way for this elapsed
             * time-window, because it would show near-zero load,
             * irrespective of how CPU intensive that task actually
             * was. This is undesirable for latency-sensitive bursty
             * workloads.
             *
             * To avoid this, reuse the 'load' from the previous
             * time-window and give this task a chance to start with
             * a reasonably high CPU frequency. However, that
             * shouldn't be over-done, lest we get stuck at a high
             * load (high frequency) for too long, even when the
             * current system load has actually dropped down, so
             * clear prev_load to guarantee that the load will be
             * computed again next time.
             *
             * Detecting this situation is easy: an unusually large
             * 'idle_time' (as compared to the sampling rate)
             * indicates this scenario.
             */
            load = j_cdbs->prev_load;
            j_cdbs->prev_load = 0;
        // In the common case execution reaches here and the load is computed
        } else {
            if (time_elapsed >= idle_time) {
                load = 100 * (time_elapsed - idle_time) / time_elapsed;
            } else {
                /*
                 * That can happen if idle_time is returned by
                 * get_cpu_idle_time_jiffy(). In that case
                 * idle_time is roughly equal to the difference
                 * between time_elapsed and "busy time" obtained
                 * from CPU statistics. Then, the "busy time"
                 * can end up being greater than time_elapsed
                 * (for example, if jiffies_64 and the CPU
                 * statistics are updated by different CPUs),
                 * so idle_time may in fact be negative. That
                 * means, though, that the CPU was busy all
                 * the time (on the rough average) during the
                 * last sampling interval and 100 can be
                 * returned as the load.
                 */
                load = (int)idle_time < 0 ? 100 : 0;
            }
            j_cdbs->prev_load = load;
        }
        if (unlikely((int)idle_time > 2 * sampling_rate)) {
            unsigned int periods = idle_time / sampling_rate;

            if (periods < idle_periods)
                idle_periods = periods;
        }

        // Track the highest load among the CPUs of this policy
        if (load > max_load)
            max_load = load;
    }
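As a worked example of the common path: with a 20000 us window in which the CPU was idle for 6000 us, load = 100 * (20000 - 6000) / 20000 = 70, so this CPU contributes a load of 70 to max_load.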
    /* The next fields are only needed if fast switch cannot be used: */
    struct irq_work         irq_work;
    struct kthread_work     work;
    struct mutex            work_lock;
    struct kthread_worker   worker;
    struct task_struct      *thread;
    bool                    work_in_progress;
    /*
     * OTOH, for energy computation we need the estimated running time, so
     * include util_dl and ignore dl_bw.
     */
    if (type == ENERGY_UTIL)
        util += dl_util;
    /*
     * There is still idle time; further improve the number by using the
     * irq metric. Because IRQ/steal time is hidden from the task clock we
     * need to scale the task numbers:
     *
     *              max - irq
     *   U' = irq + --------- * U
     *                 max
     */
    // IRQ time steals part of the CPU's capacity, leaving less for tasks;
    // scale_irq_capacity() scales the task utilization down accordingly
    util = scale_irq_capacity(util, irq, max);
    util += irq;
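For reference, scale_irq_capacity() in kernel/sched/sched.h is essentially the formula above (shown here as a sketch of the 5.10-era helper):

static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq,
                                 unsigned long max)
{
    util *= (max - irq);
    util /= max;

    return util;
}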
    /*
     * Bandwidth required by DEADLINE must always be granted while, for
     * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
     * to gracefully reduce the frequency when no tasks show up for longer
     * periods of time.
     *
     * Ideally we would like to set bw_dl as min/guaranteed freq and util +
     * bw_dl as requested freq. However, cpufreq is not yet ready for such
     * an interface. So, we only do the latter for now.
     */
    if (type == FREQUENCY_UTIL)
        util += cpu_bw_dl(rq);
The CPU benchmark ARM recommends for this purpose is Dhrystone 2.1 or later; the single-core score can be used as a reference for the capacity-dmips-mhz property. DMIPS (Dhrystone Million Instructions executed Per Second) is the MIPS figure measured with the Dhrystone integer benchmark. CPU performance is commonly expressed in MIPS (millions of instructions per second), and DMIPS/MHz expresses how many DMIPS the CPU delivers per MHz of clock frequency; this ratio is what the device tree encodes as capacity-dmips-mhz.
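How these device-tree values end up as the scheduler's 0..1024 capacities can be illustrated with a small sketch (not the kernel's drivers/base/arch_topology.c code; the normalize_capacity() helper and the sample numbers are made up): the raw score is capacity-dmips-mhz multiplied by the CPU's maximum clock, and everything is then normalized so that the biggest CPU gets SCHED_CAPACITY_SCALE (1024).

#define SCHED_CAPACITY_SCALE    1024

/* Hypothetical helper: raw = dmips_mhz * max_freq_mhz, then normalize */
static unsigned long normalize_capacity(unsigned long dmips_mhz,
                                        unsigned long max_freq_mhz,
                                        unsigned long biggest_raw)
{
    return dmips_mhz * max_freq_mhz * SCHED_CAPACITY_SCALE / biggest_raw;
}

/*
 * Example (made-up numbers): a big core with 1024 dmips/MHz at 2200 MHz has
 * raw = 2252800 and capacity 1024; a little core with 430 dmips/MHz at
 * 1800 MHz has raw = 774000 and capacity ~351.
 */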
    // Whether this root_domain is in the overutilized state
    int             overutilized;

    /*
     * The bit corresponding to a CPU gets set here if such CPU has more
     * than one runnable -deadline task (as it is below for RT tasks).
     */
    cpumask_var_t       dlo_mask;
    atomic_t            dlo_count;
    struct dl_bw        dl_bw;
    struct cpudl        cpudl;

#ifdef HAVE_RT_PUSH_IPI
    /*
     * For IPI pull requests, loop across the rto_mask.
     */
    struct irq_work     rto_push_work;
    raw_spinlock_t      rto_lock;
    /* These are only updated and read within rto_lock */
    int                 rto_loop;
    int                 rto_cpu;
    /* These atomics are updated outside of a lock */
    atomic_t            rto_loop_next;
    atomic_t            rto_loop_start;
#endif
    /*
     * The "RT overload" flag: it gets set if a CPU has more than
     * one runnable RT task.
     */
    cpumask_var_t       rto_mask;
    struct cpupri       cpupri;

    // Capacity of the highest-capacity CPU in the system
    unsigned long       max_cpu_capacity;

    /*
     * NULL-terminated list of performance domains intersecting with the
     * CPUs of the rd. Protected by RCU.
     */
    // Head of the singly linked list of perf_domain structures
    struct perf_domain __rcu *pd;
};
// kernel/sched/topology.c
static bool build_perf_domains(const struct cpumask *cpu_map)
{
    // nr_cpus is the number of CPUs currently set in cpu_map
    int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
    struct perf_domain *pd = NULL, *tmp;
    int cpu = cpumask_first(cpu_map);
    struct root_domain *rd = cpu_rq(cpu)->rd;
    struct cpufreq_policy *policy;
    struct cpufreq_governor *gov;
    if (!sysctl_sched_energy_aware)
        goto free;

    /* EAS is enabled for asymmetric CPU capacity topologies. */
    if (!per_cpu(sd_asym_cpucapacity, cpu)) {
        if (sched_debug()) {
            pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
                    cpumask_pr_args(cpu_map));
        }
        goto free;
    }

    /* EAS definitely does *not* handle SMT */
    if (sched_smt_active()) {
        pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
                cpumask_pr_args(cpu_map));
        goto free;
    }

        /* Do not attempt EAS if schedutil is not being used. */
        policy = cpufreq_cpu_get(i);
        if (!policy)
            goto free;
        gov = policy->governor;
        cpufreq_cpu_put(policy);
        if (gov != &schedutil_gov) {
            if (rd->pd)
                pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
                        cpumask_pr_args(cpu_map));
            goto free;
        }

        /* Create the new pd and add it to the local list. */
        tmp = pd_init(i);
        if (!tmp)
            goto free;
        tmp->next = pd;
        pd = tmp;

        /*
         * Count performance domains and performance states for the
         * complexity check.
         */
        nr_pd++;
        nr_ps += em_pd_nr_perf_states(pd->em_pd);
    }

    /* Bail out if the Energy Model complexity is too high. */
    if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
        WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
             cpumask_pr_args(cpu_map));
        goto free;
    }
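As a worked example of the complexity check: a two-cluster system with 8 CPUs and 8 OPPs per cluster gives nr_pd = 2, nr_ps = 8 + 8 = 16 and nr_cpus = 8, so the cost is 2 * (16 + 8) = 48, far below EM_MAX_COMPLEXITY (2048 in 5.10), and EAS can be enabled.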
perf_domain_debug(cpu_map, pd);
    /* Attach the new list of performance domains to the root domain. */
    tmp = rd->pd;
    rcu_assign_pointer(rd->pd, pd);
    if (tmp)
        call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
/*
 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
 * to @dst_cpu.
 */
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
    struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
    // Start from the CFS utilization of this CPU
    unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);

    /*
     * If @p migrates from @cpu to another, remove its contribution. Or,
     * if @p migrates from another CPU to @cpu, add its contribution. In
     * the other cases, @cpu is not impacted by the migration, so the
     * util_avg should already be correct.
     */
    // With dst_cpu == -1: if p currently runs on the CPU being inspected,
    // util = cfs_util - task_util(p)
    if (task_cpu(p) == cpu && dst_cpu != cpu)
        sub_positive(&util, task_util(p));
    else if (task_cpu(p) != cpu && dst_cpu == cpu)
        util += task_util(p);

    if (sched_feat(UTIL_EST)) {
        util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);

        /*
         * During wake-up, the task isn't enqueued yet and doesn't
         * appear in the cfs_rq->avg.util_est.enqueued of any rq,
         * so just add it (if needed) to "simulate" what will be
         * cpu_util() after the task has been enqueued.
         */
        if (dst_cpu == cpu)
            util_est += _task_util_est(p);

        util = max(util, util_est);
    }

    return min(util, capacity_orig_of(cpu));
}
/*
 * compute_energy(): Estimates the energy that @pd would consume if @p was
 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
 * to compute what would be the energy if we decided to actually migrate that
 * task.
 */
// Estimate the energy of the whole pd (i.e. this cluster) after migrating
// the task to dst_cpu
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
    struct cpumask *pd_mask = perf_domain_span(pd);
    // CPU capacity, normalized to 1024
    unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
    unsigned long max_util = 0, sum_util = 0;
    int cpu;
// Compute the total energy of all CPUs in this performance domain
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
                                          unsigned long max_util,
                                          unsigned long sum_util)
{
    unsigned long freq, scale_cpu;
    struct em_perf_state *ps;
    int i, cpu;
    /*
     * In order to predict the performance state, map the utilization of
     * the most utilized CPU of the performance domain to a requested
     * frequency, like schedutil.
     */
    cpu = cpumask_first(to_cpumask(pd->cpus));
    scale_cpu = arch_scale_cpu_capacity(cpu);
    ps = &pd->table[pd->nr_perf_states - 1];
    // Use the highest CPU utilization in this perf_domain to predict the
    // frequency the domain will be asked to run at ("predicted frequency"):
    //   freq = 1.25 * max_freq * max_util / scale_cpu
    // i.e. a 25% margin is added on top of the raw utilization-to-frequency
    // mapping
    freq = map_util_freq(max_util, ps->frequency, scale_cpu);
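For reference, in 5.10-era kernels map_util_freq() in include/linux/sched/cpufreq.h is where the 1.25 margin comes from (freq plus freq/4):

static inline unsigned long map_util_freq(unsigned long util,
                                          unsigned long freq,
                                          unsigned long cap)
{
    return (freq + (freq >> 2)) * util / cap;
}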
    /*
     * Find the lowest performance state of the Energy Model above the
     * requested frequency.
     */
    // Map the predicted frequency onto the frequency table.
    // Because of the 1.25 margin, the predicted frequency may exceed the
    // maximum frequency; in that case the loop leaves ps pointing at the
    // highest performance state.
    for (i = 0; i < pd->nr_perf_states; i++) {
        ps = &pd->table[i];
        if (ps->frequency >= freq)
            break;
    }
    /*
     * The capacity of a CPU in the domain at the performance state (ps)
     * can be computed as:
     *
     *             ps->freq * scale_cpu
     *   ps->cap = --------------------                          (1)
     *                 cpu_max_freq
     *
     * So, ignoring the costs of idle states (which are not available in
     * the EM), the energy consumed by this CPU at that performance state
     * is estimated as:
     *
     *             ps->power * cpu_util
     *   cpu_nrg = --------------------                          (2)
     *                   ps->cap
     *
     * since 'cpu_util / ps->cap' represents its percentage of busy time.
     *
     *   NOTE: Although the result of this computation actually is in
     *         units of power, it can be manipulated as an energy value
     *         over a scheduling period, since it is assumed to be
     *         constant during that interval.
     *
     * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
     * of two terms:
     *
     *             ps->power * cpu_max_freq   cpu_util
     *   cpu_nrg = ------------------------ * ---------          (3)
     *                    ps->freq            scale_cpu
     *
     * The first term is static, and is stored in the em_perf_state struct
     * as 'ps->cost'.
     *
     * Since all CPUs of the domain have the same micro-architecture, they
     * share the same 'ps->cost', and the same CPU capacity. Hence, the
     * total energy of the domain (which is the simple sum of the energy of
     * all of its CPUs) can be factorized as:
     *
     *            ps->cost * \Sum cpu_util
     *   pd_nrg = ------------------------                       (4)
     *                  scale_cpu
     */
    // Energy is computed from ps->cost of the selected performance state.
    // The cost of each frequency is a constant calculated when the Energy
    // Model is registered.  All CPUs in a perf_domain run at the same
    // frequency, so the per-CPU utilizations are simply summed and the
    // domain energy is computed directly from sum_util.
    return ps->cost * sum_util / scale_cpu;
}
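A worked example with illustrative numbers: suppose the selected state has ps->power = 1000 mW at ps->freq = 1 GHz on a CPU whose cpu_max_freq is 2 GHz, so ps->cost = ps->power * cpu_max_freq / ps->freq = 1000 * 2 = 2000. With sum_util = 512 and scale_cpu = 1024, the estimated domain energy is 2000 * 512 / 1024 = 1000.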
    /*
     * Energy-aware wake-up happens on the lowest sched_domain starting
     * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
     */
    sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
    while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
        sd = sd->parent;
    if (!sd)
        goto fail;

    sync_entity_load_avg(&p->se);
    // If p's utilization is 0, bail out and simply return prev_cpu
    if (!task_util_est(p))
        goto unlock;
            /*
             * Skip CPUs that cannot satisfy the capacity request.
             * IOW, placing the task there would make the CPU
             * overutilized. Take uclamp into account to see how
             * much capacity we can get out of the CPU; this is
             * aligned with schedutil_cpu_util().
             */
            util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
            // The CPU has to keep roughly 20% of its capacity in reserve;
            // if it cannot, move on to the next CPU
            if (!fits_capacity(util, cpu_cap))
                continue;
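The roughly 20% headroom comes from the fits_capacity() check in kernel/sched/fair.c, which requires the utilization to stay below about 80% of the available capacity:

#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)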
            /* Always use prev_cpu as a candidate. */
            // If the CPU being examined is the one the task ran on before
            if (cpu == prev_cpu) {
                // Energy of the whole pd if p is placed on this CPU; dst_cpu
                // here is the CPU p previously ran on, so the predicted
                // frequency already accounts for p's utilization
                prev_delta = compute_energy(p, prev_cpu, pd);
                // Energy added to the pd by placing p on this CPU
                prev_delta -= base_energy_pd;
                // Keep track of the best (lowest) energy delta
                best_delta = min(best_delta, prev_delta);
            }

            /*
             * Find the CPU with the maximum spare capacity in
             * the performance domain
             */
            // Remember the CPU that would have the most spare capacity with
            // p placed on it, and that spare capacity
            if (spare_cap > max_spare_cap) {
                max_spare_cap = spare_cap;
                max_spare_cap_cpu = cpu;
            }
        }

        /* Evaluate the energy impact of using this CPU. */
        // Within a cluster, only the CPU with the most spare capacity is
        // compared (energy-wise) against the CPUs of the other clusters
        if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
            // Energy of the whole pd if p is placed on the max-spare-capacity
            // CPU; dst_cpu here is never the CPU p currently runs on, so
            // cpu_util_next() adds p's utilization to that CPU's utilization
            cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
            // Energy delta relative to the idle baseline of this pd
            cur_delta -= base_energy_pd;
            // If this delta beats running p on prev_cpu, pick this CPU instead
            if (cur_delta < best_delta) {
                best_delta = cur_delta;
                best_energy_cpu = max_spare_cap_cpu;
            }
        }
    }
unlock:
    rcu_read_unlock();

    /*
     * Pick the best CPU if prev_cpu cannot be used, or if it saves at
     * least 6% of the energy used by prev_cpu.
     */
    // If prev_cpu could not be evaluated, return the best-energy CPU directly
    if (prev_delta == ULONG_MAX)
        return best_energy_cpu;