ANDROID: cpufreq: track per-task time in state

Add time in state data to task structs, and create
/proc/<pid>/time_in_state files to show how long each individual task
has run at each frequency.
Create a CONFIG_CPU_FREQ_TIMES option to enable/disable this tracking.

Bug: 72339335
Bug: 127641090
Test: Read /proc/<pid>/time_in_state
Change-Id: Ia6456754f4cb1e83b2bc35efa8fbe9f8696febc8
Signed-off-by: Connor O'Brien <connoro@google.com>
[astrachan: Folded the following changes into this patch:
            a6d3de6a7fba ("ANDROID: Reduce use of #ifdef CONFIG_CPU_FREQ_TIMES")
            b89ada5d9c09 ("ANDROID: Fix massive cpufreq_times memory leaks")]
Signed-off-by: Alistair Strachan <astrachan@google.com>
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 608af20..e131237 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -37,6 +37,13 @@
 
 	  If in doubt, say N.
 
+config CPU_FREQ_TIMES
+       bool "CPU frequency time-in-state statistics"
+       help
+         Export CPU time-in-state information through procfs.
+
+         If in doubt, say N.
+
 choice
 	prompt "Default CPUFreq governor"
 	default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index c1ffeab..648beca 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -5,7 +5,10 @@
 # CPUfreq stats
 obj-$(CONFIG_CPU_FREQ_STAT)             += cpufreq_stats.o
 
-# CPUfreq governors 
+# CPUfreq times
+obj-$(CONFIG_CPU_FREQ_TIMES)		+= cpufreq_times.o
+
+# CPUfreq governors
 obj-$(CONFIG_CPU_FREQ_GOV_PERFORMANCE)	+= cpufreq_performance.o
 obj-$(CONFIG_CPU_FREQ_GOV_POWERSAVE)	+= cpufreq_powersave.o
 obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE)	+= cpufreq_userspace.o
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index f53fb41..98b5bac 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -19,6 +19,7 @@
 
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
+#include <linux/cpufreq_times.h>
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/init.h>
@@ -349,6 +350,7 @@ static void cpufreq_notify_transition(struct cpufreq_policy *policy,
 		}
 
 		cpufreq_stats_record_transition(policy, freqs->new);
+		cpufreq_times_record_transition(freqs);
 		policy->cur = freqs->new;
 	}
 }
@@ -1295,6 +1297,7 @@ static int cpufreq_online(unsigned int cpu)
 			goto out_destroy_policy;
 
 		cpufreq_stats_create_table(policy);
+		cpufreq_times_create_policy(policy);
 
 		write_lock_irqsave(&cpufreq_driver_lock, flags);
 		list_add(&policy->policy_list, &cpufreq_policy_list);
diff --git a/drivers/cpufreq/cpufreq_times.c b/drivers/cpufreq/cpufreq_times.c
new file mode 100644
index 0000000..339a3e9
--- /dev/null
+++ b/drivers/cpufreq/cpufreq_times.c
@@ -0,0 +1,207 @@
+/* drivers/cpufreq/cpufreq_times.c
+ *
+ * Copyright (C) 2018 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/cpufreq_times.h>
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+
+static DEFINE_SPINLOCK(task_time_in_state_lock); /* task->time_in_state */
+
+/**
+ * struct cpu_freqs - per-cpu frequency information
+ * @offset: start of these freqs' stats in task time_in_state array
+ * @max_state: number of entries in freq_table
+ * @last_index: index in freq_table of last frequency switched to
+ * @freq_table: list of available frequencies
+ */
+struct cpu_freqs {
+	unsigned int offset;
+	unsigned int max_state;
+	unsigned int last_index;
+	unsigned int freq_table[0];
+};
+
+static struct cpu_freqs *all_freqs[NR_CPUS];
+
+static unsigned int next_offset;
+
+void cpufreq_task_times_init(struct task_struct *p)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&task_time_in_state_lock, flags);
+	p->time_in_state = NULL;
+	spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+	p->max_state = 0;
+}
+
+void cpufreq_task_times_alloc(struct task_struct *p)
+{
+	void *temp;
+	unsigned long flags;
+	unsigned int max_state = READ_ONCE(next_offset);
+
+	/* We use one array to avoid multiple allocs per task */
+	temp = kcalloc(max_state, sizeof(p->time_in_state[0]), GFP_ATOMIC);
+	if (!temp)
+		return;
+
+	spin_lock_irqsave(&task_time_in_state_lock, flags);
+	p->time_in_state = temp;
+	spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+	p->max_state = max_state;
+}
+
+/* Caller must hold task_time_in_state_lock */
+static int cpufreq_task_times_realloc_locked(struct task_struct *p)
+{
+	void *temp;
+	unsigned int max_state = READ_ONCE(next_offset);
+
+	temp = krealloc(p->time_in_state, max_state * sizeof(u64), GFP_ATOMIC);
+	if (!temp)
+		return -ENOMEM;
+	p->time_in_state = temp;
+	memset(p->time_in_state + p->max_state, 0,
+	       (max_state - p->max_state) * sizeof(u64));
+	p->max_state = max_state;
+	return 0;
+}
+
+void cpufreq_task_times_exit(struct task_struct *p)
+{
+	unsigned long flags;
+	void *temp;
+
+	spin_lock_irqsave(&task_time_in_state_lock, flags);
+	temp = p->time_in_state;
+	p->time_in_state = NULL;
+	spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+	kfree(temp);
+}
+
+int proc_time_in_state_show(struct seq_file *m, struct pid_namespace *ns,
+	struct pid *pid, struct task_struct *p)
+{
+	unsigned int cpu, i;
+	u64 cputime;
+	unsigned long flags;
+	struct cpu_freqs *freqs;
+	struct cpu_freqs *last_freqs = NULL;
+
+	spin_lock_irqsave(&task_time_in_state_lock, flags);
+	for_each_possible_cpu(cpu) {
+		freqs = all_freqs[cpu];
+		if (!freqs || freqs == last_freqs)
+			continue;
+		last_freqs = freqs;
+
+		seq_printf(m, "cpu%u\n", cpu);
+		for (i = 0; i < freqs->max_state; i++) {
+			if (freqs->freq_table[i] == CPUFREQ_ENTRY_INVALID)
+				continue;
+			cputime = 0;
+			if (freqs->offset + i < p->max_state &&
+			    p->time_in_state)
+				cputime = p->time_in_state[freqs->offset + i];
+			seq_printf(m, "%u %lu\n", freqs->freq_table[i],
+				   (unsigned long)nsec_to_clock_t(cputime));
+		}
+	}
+	spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+	return 0;
+}
+
+void cpufreq_acct_update_power(struct task_struct *p, u64 cputime)
+{
+	unsigned long flags;
+	unsigned int state;
+	struct cpu_freqs *freqs = all_freqs[task_cpu(p)];
+
+	if (!freqs || p->flags & PF_EXITING)
+		return;
+
+	state = freqs->offset + READ_ONCE(freqs->last_index);
+
+	spin_lock_irqsave(&task_time_in_state_lock, flags);
+	if ((state < p->max_state || !cpufreq_task_times_realloc_locked(p)) &&
+	    p->time_in_state)
+		p->time_in_state[state] += cputime;
+	spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+}
+
+void cpufreq_times_create_policy(struct cpufreq_policy *policy)
+{
+	int cpu, index;
+	unsigned int count = 0;
+	struct cpufreq_frequency_table *pos, *table;
+	struct cpu_freqs *freqs;
+	void *tmp;
+
+	if (all_freqs[policy->cpu])
+		return;
+
+	table = policy->freq_table;
+	if (!table)
+		return;
+
+	cpufreq_for_each_entry(pos, table)
+		count++;
+
+	tmp =  kzalloc(sizeof(*freqs) + sizeof(freqs->freq_table[0]) * count,
+		       GFP_KERNEL);
+	if (!tmp)
+		return;
+
+	freqs = tmp;
+	freqs->max_state = count;
+
+	index = cpufreq_frequency_table_get_index(policy, policy->cur);
+	if (index >= 0)
+		WRITE_ONCE(freqs->last_index, index);
+
+	cpufreq_for_each_entry(pos, table)
+		freqs->freq_table[pos - table] = pos->frequency;
+
+	freqs->offset = next_offset;
+	WRITE_ONCE(next_offset, freqs->offset + count);
+	for_each_cpu(cpu, policy->related_cpus)
+		all_freqs[cpu] = freqs;
+}
+
+void cpufreq_times_record_transition(struct cpufreq_freqs *freq)
+{
+	int index;
+	struct cpu_freqs *freqs = all_freqs[freq->cpu];
+	struct cpufreq_policy *policy;
+
+	if (!freqs)
+		return;
+
+	policy = cpufreq_cpu_get(freq->cpu);
+	if (!policy)
+		return;
+
+	index = cpufreq_frequency_table_get_index(policy, freq->new);
+	if (index >= 0)
+		WRITE_ONCE(freqs->last_index, index);
+
+	cpufreq_cpu_put(policy);
+}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7e9f07bf..b51e81e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -94,6 +94,7 @@
 #include <linux/sched/stat.h>
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
+#include <linux/cpufreq_times.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -3006,6 +3007,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+#ifdef CONFIG_CPU_FREQ_TIMES
+	ONE("time_in_state", 0444, proc_time_in_state_show),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3384,6 +3388,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+#ifdef CONFIG_CPU_FREQ_TIMES
+	ONE("time_in_state", 0444, proc_time_in_state_show),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/cpufreq_times.h b/include/linux/cpufreq_times.h
new file mode 100644
index 0000000..bfef6e6
--- /dev/null
+++ b/include/linux/cpufreq_times.h
@@ -0,0 +1,41 @@
+/* drivers/cpufreq/cpufreq_times.c
+ *
+ * Copyright (C) 2018 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_CPUFREQ_TIMES_H
+#define _LINUX_CPUFREQ_TIMES_H
+
+#include <linux/cpufreq.h>
+#include <linux/pid.h>
+
+#ifdef CONFIG_CPU_FREQ_TIMES
+void cpufreq_task_times_init(struct task_struct *p);
+void cpufreq_task_times_alloc(struct task_struct *p);
+void cpufreq_task_times_exit(struct task_struct *p);
+int proc_time_in_state_show(struct seq_file *m, struct pid_namespace *ns,
+			    struct pid *pid, struct task_struct *p);
+void cpufreq_acct_update_power(struct task_struct *p, u64 cputime);
+void cpufreq_times_create_policy(struct cpufreq_policy *policy);
+void cpufreq_times_record_transition(struct cpufreq_freqs *freq);
+#else
+static inline void cpufreq_task_times_init(struct task_struct *p) {}
+static inline void cpufreq_task_times_alloc(struct task_struct *p) {}
+static inline void cpufreq_task_times_exit(struct task_struct *p) {}
+static inline void cpufreq_acct_update_power(struct task_struct *p,
+					     u64 cputime) {}
+static inline void cpufreq_times_create_policy(struct cpufreq_policy *policy) {}
+static inline void cpufreq_times_record_transition(
+	struct cpufreq_freqs *freq) {}
+#endif /* CONFIG_CPU_FREQ_TIMES */
+#endif /* _LINUX_CPUFREQ_TIMES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 977cb57..97f4d64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -799,6 +799,10 @@ struct task_struct {
 	u64				stimescaled;
 #endif
 	u64				gtime;
+#ifdef CONFIG_CPU_FREQ_TIMES
+	u64				*time_in_state;
+	unsigned int			max_state;
+#endif
 	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	struct vtime			vtime;
diff --git a/kernel/fork.c b/kernel/fork.c
index f0b5847..65acaa2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,6 +91,7 @@
 #include <linux/kcov.h>
 #include <linux/livepatch.h>
 #include <linux/thread_info.h>
+#include <linux/cpufreq_times.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -394,6 +395,8 @@ void put_task_stack(struct task_struct *tsk)
 
 void free_task(struct task_struct *tsk)
 {
+	cpufreq_task_times_exit(tsk);
+
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 	/*
 	 * The task is finally done with both the stack and thread_info,
@@ -1708,6 +1711,8 @@ static __latent_entropy struct task_struct *copy_process(
 	if (!p)
 		goto fork_out;
 
+	cpufreq_task_times_init(p);
+
 	/*
 	 * This _must_ happen before we call free_task(), i.e. before we jump
 	 * to any of the bad_fork_* labels. This is to avoid freeing
@@ -2170,6 +2175,8 @@ long _do_fork(unsigned long clone_flags,
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
+	cpufreq_task_times_alloc(p);
+
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0796f93..ca4208d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,6 +1,7 @@
 /*
  * Simple CPU accounting cgroup controller
  */
+#include <linux/cpufreq_times.h>
 #include "sched.h"
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -128,6 +129,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
 
 	/* Account for user time used */
 	acct_account_cputime(p);
+
+	/* Account power usage for user time */
+	cpufreq_acct_update_power(p, cputime);
 }
 
 /*
@@ -172,6 +176,9 @@ void account_system_index_time(struct task_struct *p,
 
 	/* Account for system time used */
 	acct_account_cputime(p);
+
+	/* Account power usage for system time */
+	cpufreq_acct_update_power(p, cputime);
 }
 
 /*