I want to reserve some memory to save kernel information. I copied reserve_crashkernel function to arm64 and modified it:
/* 16M alignment for crash kernel regions */
#define CRASH_ALIGN (16 << 20)
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
.start = 0,
.end = 0,
.flags = IORESOURCE_MEM
};
static void __init reserve_crashkernel(void)
{
unsigned long long crash_size, crash_base, total_mem;
int ret;
crash_size = CRASH_ALIGN;
total_mem = memblock_phys_mem_size();
pr_info("crashkernel find memory %x - %llx.\n", CRASH_ALIGN, memblock_end_of_DRAM());
crash_base = memblock_find_in_range(CRASH_ALIGN, memblock_end_of_DRAM(),
crash_size, CRASH_ALIGN);
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
ret = memblock_reserve(crash_base, crash_size);
if (ret) {
pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
return;
}
pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
(unsigned long)(crash_size >> 20),
(unsigned long)(crash_base >> 20),
(unsigned long)(total_mem >> 20));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
}
When the kernel started, I can find kernel print like this:
[ 0.000000] crashkernel find memory 1000000 - 210000000.
[ 0.000000] Reserving 16MB of memory at 8272MB for crashkernel (System RAM: 8190MB)
But the /proc/iomem doesn't seem right. Without my code there is a 'System RAM' region:
100000000-20fffffff : System RAM
Now with reserve_crashkernel, the region changed to:
205000000-205ffffff : Crash kernel
I don't why the 'System RAM' region disappeared and I'm not sure that my code is correct.
Related
I have this device tree.
reserved-memory {
#address-cells = <2>;
#size-cells = <2>;
ranges;
axpu_reserved_mem: axpursvd#90000000 {
no-map;
reg = <0x0 0x90000000 0x0 0x30000000>;
};
};
axpu#50000000 {
compatible = "ab21-axpu";
reg = <0 0x50000000 0 0x10000000>;
...
memory-region = <&axpu_reserved_mem>;
};
With simple test, in the probe function, I did something like this (reduced).
struct axpu_dev {
struct device *dev;
void __iomem *base;
u64 paddr;
u64 vaddr;
};
static int axpu_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct resource *res;
struct axpu_dev *axpu;
struct device_node *np;
int rc;
int ret;
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
axpu = devm_kzalloc(dev, sizeof(*axpu), GFP_KERNEL);
axpu->dev = dev;
axpu->base = devm_ioremap(dev, res->start, resource_size(res));
res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
devm_request_irq(dev, res->start, axpu_irq_handler, IRQF_TRIGGER_HIGH, "axpu_irq", axpu);
np = of_parse_phandle(dev->of_node, "memory-region", 0);
rc = of_address_to_resource(np, 0, res);
axpu->paddr = res->start;
axpu->vaddr = memremap(res->start, resource_size(res), MEMREMAP_WB);
platform_set_drvdata(pdev, axpu);
axpu_init(axpu);
return sysfs_create_group(&dev->kobj, &axpu_attr_group);
}
and in the axpu_init function, I access it with virtual address.
static void axpu_init(struct axpu_dev *axpu)
{
printk("testing reserved memory ..\n");
writel_relaxed(0x12345678, axpu->vaddr + 0);
writel_relaxed(0x23456789, axpu->vaddr + 8);
printk("read-back data = %llx, %llx\n", readl_relaxed(axpu->vaddr + 0), readl_relaxed(axpu->vaddr + 8));
}
When I execute it, it runs ok.
/ # insmod axpu.ko
axpu_probe called!
MEM : res->start = 50000000, res->end = 5fffffff, res->name = axpu#50000000
axpu->base = ffffffc010000000
IRQ : res->start = 15, res->end = 15, res->name = axpu#50000000
axpu_dev 50000000.axpu: Allocated reserved memory, vaddr: 0xFFFFFFC080000000, paddr: 0x90000000
writing 0x12345678 at non-mapped reserved memory 0x90000000, 0x90000008 ..
read-back data = 12345678, 23456789
I have put no-map in the reserved memory sub-node. But is accessible with virtual address.
But if I try it with physical address, it crashes as below.
------------[ cut here ]------------
Ignoring spurious kernel translation fault at virtual address 0000000090000000
WARNING: CPU: 0 PID: 27 at arch/arm64/mm/fault.c:311 __do_kernel_fault+0x108/0x150
odules linked in: axpu(+)
CPU: 0 PID: 27 Comm: insmod Not tainted 5.10.0-rc5 #548
Hardware name: ETRI ab21m (DT)
pstate: 60400009 (nZCv daif +PAN -UAO -TCO BTYPE=--)
pc : __do_kernel_fault+0x108/0x150
lr : __do_kernel_fault+0x108/0x150
The document says about no-map :
no-map (optional) - empty property
- Indicates the operating system must not create a virtual mapping
of the region as part of its standard mapping of system memory,
nor permit speculative access to it under any circumstances other
than under the control of the device driver using the region.
Then what is wrong with my device tree?
I develop a kernel module using DMA dma_alloc_coherent() and remap_pfn_range().
Sometimes, when I close the app that opened the character device, I get the following message in dmesg. That leads to a kernel panic few seconds (random) later.
[ 3275.772330] BUG: Bad page map in process gnome-shell pte:b3e05275201 pmd:238adf067
[ 3275.772337] addr:00007f20bce00000 vm_flags:08000070 anon_vma: (null) mapping:ffff969f236dcdd0 index:b8
[ 3275.772375] vma->vm_ops->fault: xfs_filemap_fault+0x0/0x30 [xfs]
[ 3275.772400] vma->vm_file->f_op->mmap: xfs_file_mmap+0x0/0x80 [xfs]
[ 3275.772413] CPU: 5 PID: 4809 Comm: gnome-shell Kdump: loaded Tainted: G OE ------------ 3.10.0-1127.19.1.el7.x86_64 #1
[ 3275.772416] Hardware name: System manufacturer System Product Name/PRIME H370M-PLUS, BIOS 1801 10/17/2019
[ 3275.772417] Call Trace:
[ 3275.772425] [<ffffffffbb97ffa5>] dump_stack+0x19/0x1b
[ 3275.772432] [<ffffffffbb3ee311>] print_bad_pte+0x1f1/0x290
[ 3275.772436] [<ffffffffbb3f0676>] vm_normal_page+0xa6/0xb0
[ 3275.772440] [<ffffffffbb3f0ccb>] unmap_page_range+0x64b/0xc80
[ 3275.772444] [<ffffffffbb3f1381>] unmap_single_vma+0x81/0xf0
[ 3275.772448] [<ffffffffbb3f2db9>] unmap_vmas+0x49/0x90
[ 3275.772454] [<ffffffffbb3fcdbc>] exit_mmap+0xac/0x1a0
[ 3275.772458] [<ffffffffbb454db5>] ? flush_old_exec+0x3b5/0x950
[ 3275.772463] [<ffffffffbb298667>] mmput+0x67/0xf0
[ 3275.772467] [<ffffffffbb454f00>] flush_old_exec+0x500/0x950
[ 3275.772472] [<ffffffffbb4b38d0>] load_elf_binary+0x340/0xdb0
[ 3275.772476] [<ffffffffbb52cd53>] ? ima_get_action+0x23/0x30
[ 3275.772479] [<ffffffffbb52c26e>] ? process_measurement+0x8e/0x250
[ 3275.772482] [<ffffffffbb52c729>] ? ima_bprm_check+0x49/0x50
[ 3275.772486] [<ffffffffbb45454a>] search_binary_handler+0x9a/0x1c0
[ 3275.772490] [<ffffffffbb455c56>] do_execve_common.isra.24+0x616/0x880
[ 3275.772493] [<ffffffffbb456159>] SyS_execve+0x29/0x30
[ 3275.772498] [<ffffffffbb993478>] stub_execve+0x48/0x80
Here the process is gnome-shell but that doesn't mean anything, I saw lots of different processes, it can be anything.
In BUG: Bad page map in process gnome-shell pte:b3e05275201 pmd:238adf067
238adf067 is the base physical address of a coherent memory allocated by my driver, with an offset of 0x67 (0x238adf000)
All those messages always come with the physical address and 0x67 offset!
Those prints are generated from here: https://github.com/torvalds/linux/blob/7cf726a59435301046250c42131554d9ccc566b8/mm/memory.c#L536
I tried to remove everything useless, here is the code showing the order I call API functions:
const struct file_operations pcie_fops = {
.owner = THIS_MODULE,
.open = chr_open,
.release = chr_release,
.mmap = chr_mmap,
};
static int chr_open(struct inode *inode, struct file *filp) {
struct custom_data *custom_data;
struct chr_dev_bookkeep *chr_dev_bk;
chr_dev_bk = container_of(inode->i_cdev, struct chr_dev_bookkeep, cdev);
custom_data = kzalloc(sizeof(*custom_data), GFP_KERNEL);
filp->private_data = custom_data;
return 0;
}
static int chr_release(struct inode *inode, struct file *filp) {
struct custom_data *custom_data;
custom_data = filp->private_data;
// ==========================> FREED HERE <================================
dma_free_coherent(&pdev->dev, size, virt_addr, bus_addr);
filp->private_data = NULL;
kfree(custom_data);
return 0;
}
static int chr_mmap(struct file *filp, struct vm_area_struct *vma) {
int ret;
struct custom_data *custom_data;
custom_data = filp->private_data;
chr_dev_bk = custom_data->chr_dev_bk;
vm_len = PAGE_ALIGN(vma->vm_end - vma->vm_start);
vma->vm_flags |= VM_PFNMAP | VM_DONTCOPY | VM_DONTEXPAND;
vma->vm_private_data = custom_data;/*not really used because no vm_operations_struct.close*/
virt_addr = dma_alloc_coherent(&pdev->dev, vm_len, &bus_addr, GFP_KERNEL | __GFP_ZERO);
set_memory_uc((unsigned long)virt_addr, (vm_len / PAGE_SIZE));
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
ret = remap_pfn_range(vma, vma->vm_start,
bus_addr >> PAGE_SHIFT,
vm_len,
vma->vm_page_prot);
return ret;
}
I believe that bug can't come from my application code.
mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, filehandler, 0);.
Edit:
The #Ian Abbott's comment made me look at the kernel source code and I found a comment at dma_mmap_attrs() (=dma_mmap_coherent()): https://elixir.bootlin.com/linux/v4.7/source/include/linux/dma-mapping.h#L309
The coherent DMA buffer must not be freed by the
driver until the user space mapping has been released.
Has been released, I guess that means not during the release.
I believe that's also true for memory mapped with remap_pfn_range().
I'll write an answer if that was the problem.
I'm trying to emulate the function lookup_address(http://lxr.free-electrons.com/source/arch/x86/mm/pageattr.c#L373) for arm platform (and just for kernel pagetables).
The point is I'm getting the address of swapper_pg_dir from TTBR1, and so far that's working.
I checked it with gdb:
(gdb) file vmlinux
Reading symbols from vmlinux...done.
(gdb) p init_mm.pgd
$1 = (pgd_t *) 0xc0004000
(gdb)
and the code from my module:
static pgd_t *get_global_pgd (void)
{
pgd_t *pgd;
unsigned int ttb_reg;
asm volatile (
" mrc p15, 0, %0, c2, c0, 1"
: "=r" (ttb_reg));
ttb_reg &= TTBR_MASK;
pgd = __va (ttb_reg);
pr_info ("get_global_pgd: %p\n", pgd);
return pgd;
}
and the output:
bananapi kernel: [ 5665.358139] mod: get_global_pgd: c0004000
So far, this is matching.
Now I'm computing the addr of the right pgd, doing:
pgd = get_global_pgd() + pgd_index (addr);
And since (addr >> 21) is 0x600, I get 0xc0007000.
Then I continue with:
pud = pud_offset (pgd, addr);
pr_info ("pud: 0x%0x - %p\n",pud_val (*pud), pud);
pmd = pmd_offset (pud, addr);
pr_info ("pmd: 0x%0x - %p\n", pmd_val (*pmd), pmd);
if (pmd == NULL || pmd_none (*pmd)) {
return NULL;
}
return pte_offset_kernel (pmd, addr);
output:
bananapi kernel: [ 5665.390391] mod: pud: 0x4001140e - c0007000
bananapi kernel: [ 5665.401603] mod: pmd: 0x4001140e - c0007000
bananapi kernel: [ 5665.423838] mod: pte: 0xe59f119c - c0011020
The problem is that the pte I get seems not to be fine, because the attributes of the pte don't match.
Let's take an address from /proc/kallsyms:
c0008054 t __create_page_tables
I can read it with gdb:
(gdb) x/2x 0xc0008054
0xc0008054 <__create_page_tables>: 0xe2884901 0xe1a00004
(gdb)
But the pte I get from this address, doesn't have the present flag:
I'm checking it with (this pte is the one I got from my lookup_address):
ret = pte_present (*pte);
pr_info ("pte_present: %d\n", ret);
The pte_present is 0 (which checks L_PTE_PRESENT flag defined include/asm/pgtable-2level.h), but shoudln't be 0 as long as I can read from it in GDB.
I've tested with some other addresses, for instance: 0xc0035618:
c0035618 T __put_task_struct
And for this one the L_PTE_PRESENT big is set.
I'm pretty sure I'm missing something, or I got it wrong.
Thanks in advance!
I've read all the pointed links, but I'm afraig I still don't get the whole picture.
I'll try to explain what I understood so far:
From include/asm/pgtable-2level.h , it looks like a page stores:
0 - pte1_linux
1024 - pte2_linux
2048 - pte1_hw
3072 - pte2_hw
Actually I also saw this in early_pte_alloc function, which allocates 4096bytes for the pte:
static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot)
{
if (pmd_none(*pmd)) {
pte_t *pte = early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
__pmd_populate(pmd, __pa(pte), prot);
}
BUG_ON(pmd_bad(*pmd));
return pte_offset_kernel(pmd, addr);
}
Then, in __pmd_populate we take the phys address of the memory previously allocated, we add 2048 (for hw pte), and we OR it with the protection flag (which in case to be from a kernel_domain should be PMD_TYPE_TABLE.
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
pmdval_t prot)
{
pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
pmdp[0] = __pmd(pmdval);
#ifndef CONFIG_ARM_LPAE
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
#endif
flush_pmd_entry(pmdp);
}
So far is clear.
Given this information, a walking page should be something like:
pmd_offset_k (addr)
pud_offset (pgd, addr)
pmd_offset (pud, addr)
pte_offset_kernel (pmd, addr)
pte_offset_kernel gives the virtual address of the pmd_val stored in pmd. (it also ANDs the value with PHYS_MASK and PAGE_MASK), and adds the pte_index (addr).
At this point I should have the value of the virtual addres of the linux_pte_0 (because the previous ANDs with PAGE_MASK brought me to the top of the page).
So I think at this point I should be able to check the L_PTE_* flags.
Am I wrong?
Thanks in advance
I would like to allocate piece of physically contiguous reserved memory (in predefined physical addresses) for my device with DMA support.
As I see CMA has three options:
1. To reserve memory via kernel config file. 2. To reserve memory via kernel cmdline. 3. To reserve memory via device-tree memory node.
In the first case: size and number of areas could be reserved.
CONFIG_DMA_CMA=y
CONFIG_CMA_AREAS=7
CONFIG_CMA_SIZE_MBYTES=8
So I could use:
start_cma_virt = dma_alloc_coherent(dev->cmadev, (size_t)size_cma, &start_cma_dma, GFP_KERNEL);
in my driver to allocate contiguous memory. I could use it max 7 times and it will be possible allocate up to 8M. But unfortunately
dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
from arch/arm/mm/init.c:
void __init arm_memblock_init(struct meminfo *mi,const struct machine_desc *mdesc)
it is impossible to set predefined physical addresses for contiguous allocation.
Of Course I could use kernel cmdline:
mem=cma=cmadevlabel=8M#32M cma_map=mydevname=cmadevlabel
//struct device *dev = cmadev->dev; /*dev->name is mydevname*/
After that dma_alloc_coherent() should alloc memory in physical memory area from 32M + 8M (0x2000000 + 0x800000) up to 0x27FFFFF.
But unfortunately I have problem with this solution. Maybe my cmdline has error?
Next one try was device tree implementation.
cmadev_region: mycma {
/*no-map;*/ /*DMA coherent memory*/
/*reusable;*/
reg = <0x02000000 0x00100000>;
};
And phandle in some node:
memory-region = <&cmadev_region>;
As I saw in kernel usual it should be used like:
of_find_node_by_name(); //find needed node
of_parse_phandle(); //resolve a phandle property to a device_node pointer
of_get_address(); //get DT __be32 physical addresses
of_translate_address(); //DT represent local (bus, device) addresses so translate it to CPU physical addresses
request_mem_region(); //reserve IOMAP memory (cat /proc/iomem)
ioremap(); //alloc entry in page table for reserved memory and return kernel logical addresses.
But I want use DMA via (as I know only one external API function dma_alloc_coherent) dma_alloc_coherent() instead IO-MAP ioremap(). But how call
start_cma_virt = dma_alloc_coherent(dev->cmadev, (size_t)size_cma, &start_cma_dma, GFP_KERNEL);
associate memory from device-tree (reg = <0x02000000 0x00100000>;) to dev->cmadev ? In case with cmdline it is clear it has device name and addresses region.
Does reserved memory after call of_parse_phandle() automatically should be booked for your special driver (which parse DT). And next call dma_alloc_coherent will allocate dma area inside memory from cmadev_region: mycma?
To use dma_alloc_coherent() on reserved memory node, you need to declare that area as dma_coherent. You can do some thing like:
In dt:
cmadev_region: mycma {
compatible = "compatible-name"
no-map;
reg = <0x02000000 0x00100000>;
};
In your driver:
struct device *cma_dev;
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
{
int ret;
if (!mem) {
ret = dma_declare_coherent_memory(cma_dev, rmem->base, rmem->base,
rmem->size, DMA_MEMORY_EXCLUSIVE);
if (ret) {
pr_err("Error");
return ret;
}
}
return 0;
}
static void rmem_dma_device_release(struct reserved_mem *rmem,
struct device *dev)
{
if (dev)
dev->dma_mem = NULL;
}
static const struct reserved_mem_ops rmem_dma_ops = {
.device_init = rmem_dma_device_init,
.device_release = rmem_dma_device_release,
};
int __init cma_setup(struct reserved_mem *rmem)
{
rmem->ops = &rmem_dma_ops;
return 0;
}
RESERVEDMEM_OF_DECLARE(some-name, "compatible-name", cma_setup);
Now on this cma_dev you can perform dma_alloc_coherent and get memory.
I have an embedded Linux system running on an Atom, which is a new enough CPU to have an invariant TSC (time stamp counter), whose frequency the kernel measures on startup. I use the TSC in my own code to keep time (avoiding kernel calls), and my startup code measures the TSC rate, but I'd rather just use the kernel's measurement. Is there any way to retrieve this from the kernel? It's not in /proc/cpuinfo anywhere.
BPFtrace
As root, you can retrieve the kernel's TSC rate with bpftrace:
# bpftrace -e 'BEGIN { printf("%u\n", *kaddr("tsc_khz")); exit(); }' | tail -n
(tested it on CentOS 7 and Fedora 29)
That is the value that is defined, exported and maintained/calibrated in arch/x86/kernel/tsc.c.
GDB
Alternatively, also as root, you can also read it from /proc/kcore, e.g.:
# gdb /dev/null /proc/kcore -ex 'x/uw 0x'$(grep '\<tsc_khz\>' /proc/kallsyms \
| cut -d' ' -f1) -batch 2>/dev/null | tail -n 1 | cut -f2
(tested it on CentOS 7 and Fedora 29)
SystemTap
If the system doesn't have bpftrace nor gdb available but SystemTap you can get it like this (as root):
# cat tsc_khz.stp
#!/usr/bin/stap -g
function get_tsc_khz() %{ /* pure */
THIS->__retvalue = tsc_khz;
%}
probe oneshot {
printf("%u\n", get_tsc_khz());
}
# ./tsc_khz.stp
Of course, you can also write a small kernel module that provides access to tsc_khz via the /sys pseudo file system. Even better, somebody already did that and a tsc_freq_khz module is available on GitHub. With that the following should work:
# modprobe tsc_freq_khz
$ cat /sys/devices/system/cpu/cpu0/tsc_freq_khz
(tested on Fedora 29, reading the sysfs file doesn't require root)
Kernel Messages
In case nothing of the above is an option you can parse the TSC rate from the kernel logs. But this gets ugly fast because you see different kinds of messages on different hardware and kernels, e.g. on a Fedora 29 i7 system:
$ journalctl --boot | grep 'kernel: tsc:' -i | cut -d' ' -f5-
kernel: tsc: Detected 2800.000 MHz processor
kernel: tsc: Detected 2808.000 MHz TSC
But on a Fedora 29 Intel Atom just:
kernel: tsc: Detected 2200.000 MHz processor
While on a CentOS 7 i5 system:
kernel: tsc: Fast TSC calibration using PIT
kernel: tsc: Detected 1895.542 MHz processor
kernel: tsc: Refined TSC clocksource calibration: 1895.614 MHz
Perf Values
The Linux Kernel doesn't provide an API to read the TSC rate, yet. But it does provide one for getting the mult and shift values that can be used to convert TSC counts to nanoseconds. Those values are derived from tsc_khz - also in arch/x86/kernel/tsc.c - where tsc_khz is initialized and calibrated. And they are shared with userspace.
Example program that uses the perf API and accesses the shared page:
#include <asm/unistd.h>
#include <inttypes.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
The actual code:
int main(int argc, char **argv)
{
struct perf_event_attr pe = {
.type = PERF_TYPE_HARDWARE,
.size = sizeof(struct perf_event_attr),
.config = PERF_COUNT_HW_INSTRUCTIONS,
.disabled = 1,
.exclude_kernel = 1,
.exclude_hv = 1
};
int fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
perror("perf_event_open failed");
return 1;
}
void *addr = mmap(NULL, 4*1024, PROT_READ, MAP_SHARED, fd, 0);
if (!addr) {
perror("mmap failed");
return 1;
}
struct perf_event_mmap_page *pc = addr;
if (pc->cap_user_time != 1) {
fprintf(stderr, "Perf system doesn't support user time\n");
return 1;
}
printf("%16s %5s\n", "mult", "shift");
printf("%16" PRIu32 " %5" PRIu16 "\n", pc->time_mult, pc->time_shift);
close(fd);
}
Tested in on Fedora 29 and it works also for non-root users.
Those values can be used to convert a TSC count to nanoseconds with a function like this one:
static uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mult, uint32_t shift)
{
__uint128_t x = cyc;
x *= mult;
x >>= shift;
return x;
}
CPUID/MSR
Another way to obtain the TSC rate is to follow DPDK's lead.
DPDK on x86_64 basically uses the following strategy:
Read the 'Time Stamp Counter and Nominal Core Crystal Clock Information Leaf' via cpuid intrinsics (doesn't require special privileges), if available
Read it from the MSR (requires the rawio capability and read permissions on /dev/cpu/*/msr), if possible
Calibrate it in userspace by other means, otherwise
FWIW, a quick test shows that the cpuid leaf doesn't seem to be that widely available, e.g. an i7 Skylake and a goldmont atom don't have it. Otherwise, as can be seen from the DPDK code, using the MSR requires a bunch of intricate case distinctions.
However, in case the program already uses DPDK, getting the TSC rate, getting TSC values or converting TSC values is just a matter of using the right DPDK API.
I had a brief look and there doesn't seem to be a built-in way to directly get this information from the kernel.
However, the symbol tsc_khz (which I'm guessing is what you want) is exported by the kernel. You could write a small kernel module that exposes a sysfs interface and use that to read out the value of tsc_khz from userspace.
If writing a kernel module is not an option, it may be possible to use some Dark Magicâ„¢ to read out the value directly from the kernel memory space. Parse the kernel binary or System.map file to find the location of the tsc_khz symbol and read it from /dev/{k}mem. This is, of course, only possible provided that the kernel is configured with the appropriate options.
Lastly, from reading the kernel source comments, it looks like there's a possibility that the TSC may be unstable on some platforms. I don't know much about the inner workings of the x86 arch but this may be something you want to take into consideration.
The TSC rate is directly related to "cpu MHz" in /proc/cpuinfo. Actually, the better number to use is "bogomips". The reason is that while the freq for TSC is the max CPU freq, the current "cpu Mhz" can vary at time of your invocation.
The bogomips value is computed at boot. You'll need to adjust this value by number of cores and processor count (i.e. the number of hyperthreads) That gives you [fractional] MHz. That is what I use to do what you want to do.
To get the processor count, look for the last "processor: " line. The processor count is <value> + 1. Call it "cpu_count".
To get number of cores, any "cpu cores: " works. number of cores is <value>. Call it "core_count".
So, the formula is:
smt_count = cpu_count;
if (core_count)
smt_count /= core_count;
cpu_freq_in_khz = (bogomips * scale_factor) / smt_count;
That is extracted from my actual code, which is below.
Here's the actual code I use. You won't be able to use it directly because it relies on boilerplate I have, but it should give you some ideas, particularly with how to compute
// syslgx/tvtsc -- system time routines (RDTSC)
#include <tgb.h>
#include <zprt.h>
tgb_t systvinit_tgb[] = {
{ .tgb_val = 1, .tgb_tag = "cpu_mhz" },
{ .tgb_val = 2, .tgb_tag = "bogomips" },
{ .tgb_val = 3, .tgb_tag = "processor" },
{ .tgb_val = 4, .tgb_tag = "cpu_cores" },
{ .tgb_val = 5, .tgb_tag = "clflush_size" },
{ .tgb_val = 6, .tgb_tag = "cache_alignment" },
TGBEOT
};
// _systvinit -- get CPU speed
static void
_systvinit(void)
{
const char *file;
const char *dlm;
XFIL *xfsrc;
int matchflg;
char *cp;
char *cur;
char *rhs;
char lhs[1000];
tgb_pc tgb;
syskhz_t khzcpu;
syskhz_t khzbogo;
syskhz_t khzcur;
sysmpi_p mpi;
file = "/proc/cpuinfo";
xfsrc = fopen(file,"r");
if (xfsrc == NULL)
sysfault("systvinit: unable to open '%s' -- %s\n",file,xstrerror());
dlm = " \t";
khzcpu = 0;
khzbogo = 0;
mpi = &SYS->sys_cpucnt;
SYSZAPME(mpi);
// (1) look for "cpu MHz : 3192.515" (preferred)
// (2) look for "bogomips : 3192.51" (alternate)
// FIXME/CAE -- on machines with speed-step, bogomips may be preferred (or
// disable it)
while (1) {
cp = fgets(lhs,sizeof(lhs),xfsrc);
if (cp == NULL)
break;
// strip newline
cp = strchr(lhs,'\n');
if (cp != NULL)
*cp = 0;
// look for symbol value divider
cp = strchr(lhs,':');
if (cp == NULL)
continue;
// split symbol and value
*cp = 0;
rhs = cp + 1;
// strip trailing whitespace from symbol
for (cp -= 1; cp >= lhs; --cp) {
if (! XCTWHITE(*cp))
break;
*cp = 0;
}
// convert "foo bar" into "foo_bar"
for (cp = lhs; *cp != 0; ++cp) {
if (XCTWHITE(*cp))
*cp = '_';
}
// match on interesting data
matchflg = 0;
for (tgb = systvinit_tgb; TGBMORE(tgb); ++tgb) {
if (strcasecmp(lhs,tgb->tgb_tag) == 0) {
matchflg = tgb->tgb_val;
break;
}
}
if (! matchflg)
continue;
// look for the value
cp = strtok_r(rhs,dlm,&cur);
if (cp == NULL)
continue;
zprt(ZPXHOWSETUP,"_systvinit: GRAB/%d lhs='%s' cp='%s'\n",
matchflg,lhs,cp);
// process the value
// NOTE: because of Intel's speed step, take the highest cpu speed
switch (matchflg) {
case 1: // genuine CPU speed
khzcur = _systvinitkhz(cp);
if (khzcur > khzcpu)
khzcpu = khzcur;
break;
case 2: // the consolation prize
khzcur = _systvinitkhz(cp);
// we've seen some "wild" values
if (khzcur > 10000000)
break;
if (khzcur > khzbogo)
khzbogo = khzcur;
break;
case 3: // remember # of cpu's so we can adjust bogomips
mpi->mpi_cpucnt = atoi(cp);
mpi->mpi_cpucnt += 1;
break;
case 4: // remember # of cpu cores so we can adjust bogomips
mpi->mpi_corecnt = atoi(cp);
break;
case 5: // cache flush size
mpi->mpi_cshflush = atoi(cp);
break;
case 6: // cache alignment
mpi->mpi_cshalign = atoi(cp);
break;
}
}
fclose(xfsrc);
// we want to know the number of hyperthreads
mpi->mpi_smtcnt = mpi->mpi_cpucnt;
if (mpi->mpi_corecnt)
mpi->mpi_smtcnt /= mpi->mpi_corecnt;
zprt(ZPXHOWSETUP,"_systvinit: FINAL khzcpu=%d khzbogo=%d mpi_cpucnt=%d mpi_corecnt=%d mpi_smtcnt=%d mpi_cshalign=%d mpi_cshflush=%d\n",
khzcpu,khzbogo,mpi->mpi_cpucnt,mpi->mpi_corecnt,mpi->mpi_smtcnt,
mpi->mpi_cshalign,mpi->mpi_cshflush);
if ((mpi->mpi_cshalign == 0) || (mpi->mpi_cshflush == 0))
sysfault("_systvinit: cache parameter fault\n");
do {
// use the best reference
// FIXME/CAE -- with speed step, bogomips is better
#if 0
if (khzcpu != 0)
break;
#endif
khzcpu = khzbogo;
if (mpi->mpi_smtcnt)
khzcpu /= mpi->mpi_smtcnt;
if (khzcpu != 0)
break;
sysfault("_systvinit: unable to obtain cpu speed\n");
} while (0);
systvkhz(khzcpu);
zprt(ZPXHOWSETUP,"_systvinit: EXIT\n");
}
// _systvinitkhz -- decode value
// RETURNS: CPU freq in khz
static syskhz_t
_systvinitkhz(char *str)
{
char *src;
char *dst;
int rhscnt;
char bf[100];
syskhz_t khz;
zprt(ZPXHOWSETUP,"_systvinitkhz: ENTER str='%s'\n",str);
dst = bf;
src = str;
// get lhs of lhs.rhs
for (; *src != 0; ++src, ++dst) {
if (*src == '.')
break;
*dst = *src;
}
// skip over the dot
++src;
// get rhs of lhs.rhs and determine how many rhs digits we have
rhscnt = 0;
for (; *src != 0; ++src, ++dst, ++rhscnt)
*dst = *src;
*dst = 0;
khz = atol(bf);
zprt(ZPXHOWSETUP,"_systvinitkhz: PRESCALE bf='%s' khz=%d rhscnt=%d\n",
bf,khz,rhscnt);
// scale down (e.g. we got xxxx.yyyy)
for (; rhscnt > 3; --rhscnt)
khz /= 10;
// scale up (e.g. we got xxxx.yy--bogomips does this)
for (; rhscnt < 3; ++rhscnt)
khz *= 10;
zprt(ZPXHOWSETUP,"_systvinitkhz: EXIT khz=%d\n",khz);
return khz;
}
UPDATE:
Sigh. Yes.
I was using "cpu MHz" from /proc/cpuinfo prior to the introduction of processors with "speed step" technology, so I switched to "bogomips" and the algorithm was derived empirically based on that. When I derived it, I only had access to hyperthreaded machines. However, I've found an old one that is not and the SMT thing isn't valid.
However, it appears that bogomips is always 2x the [maximum] CPU speed. See http://www.clifton.nl/bogo-faq.html That hasn't always been my experience on all kernel versions over the years [IIRC, I started with 0.99.x], but it's probably a reliable assumption these days.
With "constant TSC" [which all newer processors have], denoted by constant_tsc in the flags: field in /proc/cpuinfo, the TSC rate is the maximum CPU frequency.
Originally, the only way to get the frequency information was from /proc/cpuinfo. Now, however, in more modern kernels, there is another way that may be easier and more definitive [I had code coverage for this in other software of mine, but had forgotten about it]:
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq
The contents of this file is the maximum CPU frequency in kHz. There are analogous files for the other CPU cores. The files should be identical for most sane motherboards (e.g. ones that are composed of the same model chip and don't try to mix [say] i7s and atoms). Otherwise, you'd have to keep track of the info on a per-core basis and that would get messy fast.
The given directory also has other interesting files. For example, if your processor has "speed step" [and some of the other files can tell you that], you can force maximum performance by writing performance to the scaling_governor file. This will disable use of speed step.
If the processor did not have constant_tsc, you'd have to disable speed step [and run the cores at maximum rate] to get accurate measurements