|
---# |
|
arch/x86/kernel/process.c:119 |
|
--- |
|
static int set_new_tls(struct task_struct *p, unsigned long tls) |
|
{ |
|
struct user_desc __user *utls = (struct user_desc __user *)tls; |
|
|
|
if (in_ia32_syscall()) ## x ## |
|
return do_set_thread_area(p, -1, utls, 0); |
|
else |
|
return do_set_thread_area_64(p, ARCH_SET_FS, tls); |
|
} |
|
--- |
|
`in_ia32_syscall()` checks whether the current process is in a 32-bit syscall. If so, `do_set_thread_area()` is used to set the TLS (Thread Local Storage) descriptor. Otherwise, `do_set_thread_area_64()` sets the 64-bit FS base via `ARCH_SET_FS`.
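
The 64-bit path is the kernel side of the `arch_prctl()` interface. As a minimal userspace sketch (assuming an x86-64 Linux host), the current thread's FS base can be read back through the same machinery:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

int main(void)
{
	unsigned long fsbase = 0;

	/* ARCH_GET_FS asks the kernel for this thread's FS base */
	if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) != 0) {
		perror("arch_prctl");
		return 1;
	}
	printf("FS base: %#lx\n", fsbase);
	return 0;
}
```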
|
---# |
|
arch/x86/kernel/cpu/mce/core.c:1519 |
|
--- |
|
static void mce_timer_fn(struct timer_list *t) |
|
{ |
|
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); |
|
unsigned long iv; |
|
|
|
WARN_ON(cpu_t != t); |
|
|
|
iv = __this_cpu_read(mce_next_interval); |
|
|
|
if (mce_available(this_cpu_ptr(&cpu_info))) { |
|
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); |
|
|
|
if (mce_intel_cmci_poll()) { |
|
iv = mce_adjust_timer(iv); |
|
goto done; |
|
} |
|
} |
|
|
|
/* |
|
* Alert userspace if needed. If we logged an MCE, reduce the polling |
|
* interval, otherwise increase the polling interval. |
|
*/ |
|
if (mce_notify_irq()) |
|
iv = max(iv / 2, (unsigned long) HZ/100); ## x ## |
|
else |
|
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); |
|
|
|
done: |
|
__this_cpu_write(mce_next_interval, iv); |
|
__start_timer(t, iv); |
|
} |
|
--- |
|
`mce_notify_irq()` checks whether an MCE (Machine Check Exception) was logged. If so, the polling interval is halved (poll more often), bounded below by HZ/100. Otherwise it is doubled, bounded above by `check_interval * HZ`.
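
The adjustment is a plain exponential back-off. A standalone sketch of the same arithmetic, where HZ and the 300-second default of `check_interval` are assumptions for illustration:

```c
#include <stdio.h>

#define HZ             1000UL
#define CHECK_INTERVAL 300UL	/* assumed default, like check_interval */

/* Halve the interval after an event, double it on quiet periods,
 * clamped between HZ/100 and CHECK_INTERVAL * HZ (in jiffies). */
static unsigned long adjust_interval(unsigned long iv, int event_seen)
{
	if (event_seen)
		return iv / 2 > HZ / 100 ? iv / 2 : HZ / 100;
	return iv * 2 < CHECK_INTERVAL * HZ ? iv * 2 : CHECK_INTERVAL * HZ;
}

int main(void)
{
	unsigned long iv = CHECK_INTERVAL * HZ;

	iv = adjust_interval(iv, 1);	/* MCE logged: poll sooner */
	printf("after event: %lu jiffies\n", iv);
	iv = adjust_interval(iv, 0);	/* quiet: back off */
	printf("after quiet: %lu jiffies\n", iv);
	return 0;
}
```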
|
---# |
|
arch/x86/kernel/hpet.c:699 |
|
--- |
|
static u64 read_hpet(struct clocksource *cs) |
|
{ |
|
unsigned long flags; |
|
union hpet_lock old, new; |
|
|
|
BUILD_BUG_ON(sizeof(union hpet_lock) != 8); |
|
|
|
/* |
|
* Read HPET directly if in NMI. |
|
*/ |
|
if (in_nmi()) |
|
return (u64)hpet_readl(HPET_COUNTER); |
|
|
|
/* |
|
* Read the current state of the lock and HPET value atomically. |
|
*/ |
|
old.lockval = READ_ONCE(hpet.lockval); ## x ## |
|
|
|
if (arch_spin_is_locked(&old.lock)) |
|
goto contended; |
|
|
|
local_irq_save(flags); |
|
if (arch_spin_trylock(&hpet.lock)) { |
|
new.value = hpet_readl(HPET_COUNTER); |
|
/* |
|
* Use WRITE_ONCE() to prevent store tearing. |
|
*/ |
|
WRITE_ONCE(hpet.value, new.value); |
|
arch_spin_unlock(&hpet.lock); |
|
local_irq_restore(flags); |
|
return (u64)new.value; |
|
} |
|
local_irq_restore(flags); |
|
|
|
contended: |
|
/* |
|
* Contended case |
|
* -------------- |
|
* Wait until the HPET value change or the lock is free to indicate |
|
* its value is up-to-date. |
|
* |
|
* It is possible that old.value has already contained the latest |
|
* HPET value while the lock holder was in the process of releasing |
|
* the lock. Checking for lock state change will enable us to return |
|
* the value immediately instead of waiting for the next HPET reader |
|
* to come along. |
|
*/ |
|
do { |
|
cpu_relax(); |
|
new.lockval = READ_ONCE(hpet.lockval); |
|
} while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); |
|
|
|
return (u64)new.value; |
|
} |
|
--- |
|
`READ_ONCE()` is a macro that forces a single, untorn read of a variable, preventing the compiler from splitting, merging, or re-issuing the access. https://www.kernel.org/doc/Documentation/memory-barriers.txt If `in_nmi()` is true, the HPET counter is read directly. Otherwise the HPET value is read under `hpet.lock` using `arch_spin_trylock()`; on contention, the reader spins until the cached value changes or the lock is released.
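
Roughly, `READ_ONCE()`/`WRITE_ONCE()` behave like relaxed C11 atomic accesses: exactly one real load or store, with no tearing or fusing. A sketch of that analogy (the kernel actually implements them with `volatile` casts, not `<stdatomic.h>`):

```c
#include <inttypes.h>
#include <stdatomic.h>
#include <stdio.h>

/* READ_ONCE()/WRITE_ONCE() roughly correspond to relaxed atomic
 * accesses: the compiler must emit exactly one load/store and may
 * not tear, fuse, or re-issue it. */
static _Atomic uint64_t shared;

static uint64_t read_once(void)
{
	return atomic_load_explicit(&shared, memory_order_relaxed);
}

static void write_once(uint64_t v)
{
	atomic_store_explicit(&shared, v, memory_order_relaxed);
}

int main(void)
{
	write_once(0xdeadbeef);
	printf("%#" PRIx64 "\n", read_once());
	return 0;
}
```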
|
---# |
|
arch/x86/kernel/process.h:26 |
|
--- |
|
/* |
|
* This needs to be inline to optimize for the common case where no extra |
|
* work needs to be done. |
|
*/ |
|
static inline void switch_to_extra(struct task_struct *prev, |
|
struct task_struct *next) |
|
{ |
|
unsigned long next_tif = task_thread_info(next)->flags; |
|
unsigned long prev_tif = task_thread_info(prev)->flags; |
|
|
|
if (IS_ENABLED(CONFIG_SMP)) { |
|
/* |
|
* Avoid __switch_to_xtra() invocation when conditional |
|
* STIBP is disabled and the only different bit is |
|
* TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not |
|
* in the TIF_WORK_CTXSW masks. |
|
*/ |
|
if (!static_branch_likely(&switch_to_cond_stibp)) { ## x ## |
|
prev_tif &= ~_TIF_SPEC_IB; |
|
next_tif &= ~_TIF_SPEC_IB; |
|
} |
|
} |
|
|
|
/* |
|
* __switch_to_xtra() handles debug registers, i/o bitmaps, |
|
* speculation mitigations etc. |
|
*/ |
|
if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT || |
|
prev_tif & _TIF_WORK_CTXSW_PREV)) |
|
__switch_to_xtra(prev, next); |
|
} |
|
--- |
|
`static_branch_likely()` checks whether a static branch (here, conditional STIBP) is enabled. If it is not, the `TIF_SPEC_IB` (Indirect Branch Speculation) bit is cleared from both flag words, so a difference in that bit alone will not force a call to `__switch_to_xtra()`.
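
The intent of the masking is that a difference in `TIF_SPEC_IB` alone should not force the slow path when conditional STIBP is off. A small sketch of that filter, where the bit positions and the work mask are illustrative assumptions rather than the kernel's real values:

```c
#include <stdio.h>

#define TIF_SPEC_IB   (1UL << 9)	/* assumed bit position */
#define TIF_WORK_MASK (TIF_SPEC_IB | (1UL << 3))

/* If conditional STIBP is off, TIF_SPEC_IB differences alone should
 * not force the slow path, so clear the bit before testing. */
static int needs_extra_work(unsigned long prev_tif, unsigned long next_tif,
			    int cond_stibp)
{
	if (!cond_stibp) {
		prev_tif &= ~TIF_SPEC_IB;
		next_tif &= ~TIF_SPEC_IB;
	}
	return (prev_tif | next_tif) & TIF_WORK_MASK ? 1 : 0;
}

int main(void)
{
	/* Only TIF_SPEC_IB differs and conditional STIBP is disabled:
	 * the expensive path is skipped. */
	printf("%d\n", needs_extra_work(TIF_SPEC_IB, 0, 0));	/* 0 */
	printf("%d\n", needs_extra_work(TIF_SPEC_IB, 0, 1));	/* 1 */
	return 0;
}
```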
|
---# |
|
arch/x86/kernel/process_64.c:213 |
|
--- |
|
/* |
|
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are |
|
* not available. The goal is to be reasonably fast on non-FSGSBASE systems. |
|
* It's forcibly inlined because it'll generate better code and this function |
|
* is hot. |
|
*/ |
|
static __always_inline void save_base_legacy(struct task_struct *prev_p, |
|
unsigned short selector, |
|
enum which_selector which) |
|
{ |
|
if (likely(selector == 0)) { ## x ## |
|
/* |
|
* On Intel (without X86_BUG_NULL_SEG), the segment base could |
|
* be the pre-existing saved base or it could be zero. On AMD |
|
* (with X86_BUG_NULL_SEG), the segment base could be almost |
|
* anything. |
|
* |
|
* This branch is very hot (it's hit twice on almost every |
|
* context switch between 64-bit programs), and avoiding |
|
* the RDMSR helps a lot, so we just assume that whatever |
|
* value is already saved is correct. This matches historical |
|
* Linux behavior, so it won't break existing applications. |
|
* |
|
* To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we |
|
* report that the base is zero, it needs to actually be zero: |
|
* see the corresponding logic in load_seg_legacy. |
|
*/ |
|
} else { |
|
/* |
|
* If the selector is 1, 2, or 3, then the base is zero on |
|
* !X86_BUG_NULL_SEG CPUs and could be anything on |
|
* X86_BUG_NULL_SEG CPUs. In the latter case, Linux |
|
* has never attempted to preserve the base across context |
|
* switches. |
|
* |
|
* If selector > 3, then it refers to a real segment, and |
|
* saving the base isn't necessary. |
|
*/ |
|
if (which == FS) |
|
prev_p->thread.fsbase = 0; |
|
else |
|
prev_p->thread.gsbase = 0; |
|
} |
|
} |
|
--- |
|
This is about saving the FS/GS base registers on context switch. If `selector` is 0 (the likely case), nothing happens: the previously saved base is assumed correct, which avoids an expensive RDMSR on this very hot path. Otherwise, either `fsbase` or `gsbase` is set to 0.
|
---# |
|
arch/x86/kernel/process_64.c:629 |
|
--- |
|
/* |
|
* switch_to(x,y) should switch tasks from x to y. |
|
* |
|
* This could still be optimized: |
|
* - fold all the options into a flag word and test it with a single test. |
|
* - could test fs/gs bitsliced |
|
* |
|
* Kprobes not supported here. Set the probe on schedule instead. |
|
* Function graph tracer not supported too. |
|
*/ |
|
__visible __notrace_funcgraph struct task_struct * |
|
__switch_to(struct task_struct *prev_p, struct task_struct *next_p) |
|
{ |
|
struct thread_struct *prev = &prev_p->thread; |
|
struct thread_struct *next = &next_p->thread; |
|
struct fpu *prev_fpu = &prev->fpu; |
|
struct fpu *next_fpu = &next->fpu; |
|
int cpu = smp_processor_id(); |
|
|
|
# ... |
|
|
|
/* |
|
* Switch the PDA and FPU contexts. |
|
*/ |
|
this_cpu_write(current_task, next_p); |
|
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); |
|
|
|
switch_fpu_finish(next_fpu); |
|
|
|
/* Reload sp0. */ |
|
update_task_stack(next_p); |
|
|
|
switch_to_extra(prev_p, next_p); |
|
|
|
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { |
|
/* |
|
* AMD CPUs have a misfeature: SYSRET sets the SS selector but |
|
* does not update the cached descriptor. As a result, if we |
|
* do SYSRET while SS is NULL, we'll end up in user mode with |
|
* SS apparently equal to __USER_DS but actually unusable. |
|
* |
|
* The straightforward workaround would be to fix it up just |
|
* before SYSRET, but that would slow down the system call |
|
* fast paths. Instead, we ensure that SS is never NULL in |
|
* system call context. We do this by replacing NULL SS |
|
* selectors at every context switch. SYSCALL sets up a valid |
|
* SS, so the only way to get NULL is to re-enter the kernel |
|
* from CPL 3 through an interrupt. Since that can't happen |
|
* in the same task as a running syscall, we are guaranteed to |
|
* context switch between every interrupt vector entry and a |
|
* subsequent SYSRET. |
|
* |
|
* We read SS first because SS reads are much faster than |
|
* writes. Out of caution, we force SS to __KERNEL_DS even if |
|
* it previously had a different non-NULL value. |
|
*/ |
|
unsigned short ss_sel; |
|
savesegment(ss, ss_sel); |
|
if (ss_sel != __KERNEL_DS) ## x ## |
|
loadsegment(ss, __KERNEL_DS); |
|
} |
|
|
|
/* Load the Intel cache allocation PQR MSR. */ |
|
resctrl_sched_in(); |
|
|
|
return prev_p; |
|
} |
|
--- |
|
`__switch_to` is the function that switches from one task to another. The marked line is part of a workaround for a known misfeature of AMD CPUs, where SYSRET sets the SS selector without updating the cached descriptor. SS is read first because SS reads are much faster than writes; out of caution, SS is forced to `__KERNEL_DS` even if it previously had a different non-NULL value.
|
---# |
|
arch/x86/kernel/signal.c:91 |
|
--- |
|
static int restore_sigcontext(struct pt_regs *regs, |
|
struct sigcontext __user *usc, |
|
unsigned long uc_flags) |
|
{ |
|
struct sigcontext sc; |
|
|
|
/* Always make any pending restarted system calls return -EINTR */ |
|
current->restart_block.fn = do_no_restart_syscall; |
|
|
|
if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) ## x ## |
|
return -EFAULT; |
|
|
|
# ...
|
|
|
/* Get CS/SS and force CPL3 */ |
|
regs->cs = sc.cs | 0x03; |
|
regs->ss = sc.ss | 0x03; |
|
|
|
regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); |
|
/* disable syscall checks */ |
|
regs->orig_ax = -1; |
|
|
|
#ifdef CONFIG_X86_64 |
|
/* |
|
* Fix up SS if needed for the benefit of old DOSEMU and |
|
* CRIU. |
|
*/ |
|
if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) |
|
force_valid_ss(regs); |
|
#endif |
|
|
|
return fpu__restore_sig((void __user *)sc.fpstate, |
|
IS_ENABLED(CONFIG_X86_32)); |
|
} |
|
--- |
|
`restore_sigcontext` restores the register state saved in a signal frame. `copy_from_user` copies `CONTEXT_COPY_SIZE` bytes of the `sigcontext` from user space into the kernel (on x86-64 this is `sizeof(struct sigcontext)`); if the copy fails, the function returns -EFAULT.
|
---# |
|
arch/x86/kernel/signal.c:469 |
|
--- |
|
static int __setup_rt_frame(int sig, struct ksignal *ksig, |
|
sigset_t *set, struct pt_regs *regs) |
|
{ |
|
struct rt_sigframe __user *frame; |
|
void __user *fp = NULL; |
|
unsigned long uc_flags; |
|
|
|
/* x86-64 should always use SA_RESTORER. */ |
|
if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) |
|
return -EFAULT; |
|
|
|
frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); |
|
uc_flags = frame_uc_flags(regs); |
|
|
|
if (!user_access_begin(frame, sizeof(*frame))) |
|
return -EFAULT; |
|
|
|
/* Create the ucontext. */ |
|
unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); |
|
unsafe_put_user(0, &frame->uc.uc_link, Efault); |
|
unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); |
|
|
|
/* Set up to return from userspace. If provided, use a stub |
|
already in userspace. */ |
|
unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); |
|
unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); |
|
unsafe_put_sigmask(set, frame, Efault); |
|
user_access_end(); |
|
|
|
if (ksig->ka.sa.sa_flags & SA_SIGINFO) { |
|
if (copy_siginfo_to_user(&frame->info, &ksig->info)) ## x ## |
|
return -EFAULT; |
|
} |
|
|
|
/* Set up registers for signal handler */ |
|
regs->di = sig; |
|
/* In case the signal handler was declared without prototypes */ |
|
regs->ax = 0; |
|
|
|
/* This also works for non SA_SIGINFO handlers because they expect the |
|
next argument after the signal number on the stack. */ |
|
regs->si = (unsigned long)&frame->info; |
|
regs->dx = (unsigned long)&frame->uc; |
|
regs->ip = (unsigned long) ksig->ka.sa.sa_handler; |
|
|
|
regs->sp = (unsigned long)frame; |
|
|
|
/* |
|
* Set up the CS and SS registers to run signal handlers in |
|
* 64-bit mode, even if the handler happens to be interrupting |
|
* 32-bit or 16-bit code. |
|
* |
|
* SS is subtle. In 64-bit mode, we don't need any particular |
|
* SS descriptor, but we do need SS to be valid. It's possible |
|
* that the old SS is entirely bogus -- this can happen if the |
|
* signal we're trying to deliver is #GP or #SS caused by a bad |
|
* SS value. We also have a compatbility issue here: DOSEMU |
|
* relies on the contents of the SS register indicating the |
|
* SS value at the time of the signal, even though that code in |
|
* DOSEMU predates sigreturn's ability to restore SS. (DOSEMU |
|
* avoids relying on sigreturn to restore SS; instead it uses |
|
* a trampoline.) So we do our best: if the old SS was valid, |
|
* we keep it. Otherwise we replace it. |
|
*/ |
|
regs->cs = __USER_CS; |
|
|
|
if (unlikely(regs->ss != __USER_DS)) |
|
force_valid_ss(regs); |
|
|
|
return 0; |
|
|
|
Efault: |
|
user_access_end(); |
|
return -EFAULT; |
|
} |
|
--- |
|
`__setup_rt_frame` builds the signal frame on the user stack. `copy_siginfo_to_user` copies the kernel's `siginfo` into the user-space frame; if the copy fails, the function returns -EFAULT.
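
The register setup (`regs->di`, `regs->si`, `regs->dx`) is exactly what makes the three arguments of a `SA_SIGINFO` handler appear in userspace. A minimal userspace counterpart (glibc supplies the `SA_RESTORER` stub behind the scenes):

```c
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>

/* The kernel-filled regs->di/si/dx arrive as these three arguments. */
static void handler(int sig, siginfo_t *info, void *ucontext)
{
	/* printf is not async-signal-safe; fine for a demo only */
	printf("sig=%d si_signo=%d si_code=%d uc=%p\n",
	       sig, info->si_signo, info->si_code, ucontext);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;	/* request the 3-argument form */
	sigaction(SIGUSR1, &sa, NULL);

	raise(SIGUSR1);
	return 0;
}
```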
|
---# |
|
arch/x86/lib/insn.c:156 |
|
--- |
|
/** |
|
* insn_get_prefixes - scan x86 instruction prefix bytes |
|
* @insn: &struct insn containing instruction |
|
* |
|
* Populates the @insn->prefixes bitmap, and updates @insn->next_byte |
|
* to point to the (first) opcode. No effect if @insn->prefixes.got |
|
* is already set. |
|
*/ |
|
void insn_get_prefixes(struct insn *insn) |
|
{ |
|
struct insn_field *prefixes = &insn->prefixes; |
|
insn_attr_t attr; |
|
insn_byte_t b, lb; |
|
int i, nb; |
|
|
|
if (prefixes->got) |
|
return; |
|
|
|
# ... |
|
/* Decode REX prefix */ |
|
if (insn->x86_64) { ## x ## |
|
b = peek_next(insn_byte_t, insn); |
|
attr = inat_get_opcode_attribute(b); |
|
if (inat_is_rex_prefix(attr)) { |
|
insn->rex_prefix.value = b; |
|
insn->rex_prefix.nbytes = 1; |
|
insn->next_byte++; |
|
if (X86_REX_W(b)) |
|
/* REX.W overrides opnd_size */ |
|
insn->opnd_bytes = 8; |
|
} |
|
} |
|
insn->rex_prefix.got = 1; |
|
|
|
# ... |
|
vex_end: |
|
insn->vex_prefix.got = 1; |
|
|
|
prefixes->got = 1; |
|
|
|
err_out: |
|
return; |
|
} |
|
--- |
|
`insn_get_prefixes` scans the prefix bytes of an instruction. If `insn->x86_64` is true, it also tries to decode a REX prefix. `inat_get_opcode_attribute` returns the attribute table entry for the byte, and `inat_is_rex_prefix` checks whether that attribute marks a REX prefix. `X86_REX_W` tests the REX.W bit, which promotes the operand size to 64 bits, so `insn->opnd_bytes` is set to 8.
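
For reference, REX prefixes occupy the byte range 0x40-0x4f in 64-bit mode and REX.W is bit 3. A tiny standalone decoder for just that step:

```c
#include <stdint.h>
#include <stdio.h>

/* In 64-bit mode, bytes 0x40-0x4f are REX prefixes; bit 3 is REX.W. */
#define X86_REX_W(b) ((b) & 0x08)

static int is_rex_prefix(uint8_t b)
{
	return (b & 0xf0) == 0x40;
}

int main(void)
{
	/* 48 89 e5 = mov rbp, rsp: REX.W promotes the operand to 64 bits */
	uint8_t insn[] = { 0x48, 0x89, 0xe5 };
	int opnd_bytes = 4;	/* default operand size */

	if (is_rex_prefix(insn[0]) && X86_REX_W(insn[0]))
		opnd_bytes = 8;
	printf("operand size: %d bytes\n", opnd_bytes);
	return 0;
}
```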
|
---# |
|
arch/x86/mm/fault.c:1101 |
|
--- |
|
static inline int |
|
access_error(unsigned long error_code, struct vm_area_struct *vma) |
|
{ |
|
/* This is only called for the current mm, so: */ |
|
bool foreign = false; |
|
|
|
/* |
|
* Read or write was blocked by protection keys. This is |
|
* always an unconditional error and can never result in |
|
* a follow-up action to resolve the fault, like a COW. |
|
*/ |
|
if (error_code & X86_PF_PK) ## x ## |
|
return 1; |
|
# ... |
|
return 0; |
|
} |
|
--- |
|
`access_error` is the function that checks if the access is allowed. If the |
|
access is not allowed, `access_error` returns 1. `error_code` is the error |
|
code of the fault. `X86_PF_PK` is the bit that indicates if the access is |
|
blocked by protection keys. |
|
---# |
|
arch/x86/mm/fault.c:1121 |
|
--- |
|
static inline int |
|
access_error(unsigned long error_code, struct vm_area_struct *vma) |
|
{ |
|
/* This is only called for the current mm, so: */ |
|
bool foreign = false; |
|
# ... |
|
/* read, present: */ |
|
if (unlikely(error_code & X86_PF_PROT)) |
|
return 1; |
|
|
|
/* read, not present: */ |
|
if (unlikely(!vma_is_accessible(vma))) |
|
return 1; |
|
|
|
return 0; |
|
} |
|
--- |
|
`access_error` is the function that checks if the access is allowed; it returns 1 when it is not. `X86_PF_PROT` indicates the page was present but its protection bits forbade the access, so a read fault with this bit set is always an error. A read of a VMA that is not accessible (`!vma_is_accessible(vma)`) is likewise an error.
|
---# |
|
arch/x86/mm/fault.c:1131 |
|
--- |
|
bool fault_in_kernel_space(unsigned long address) |
|
{ |
|
/* |
|
* On 64-bit systems, the vsyscall page is at an address above |
|
* TASK_SIZE_MAX, but is not considered part of the kernel |
|
* address space. |
|
*/ |
|
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) ## x ## |
|
return false; |
|
|
|
return address >= TASK_SIZE_MAX; |
|
} |
|
--- |
|
`fault_in_kernel_space` checks whether the faulting address lies in kernel space. `TASK_SIZE_MAX` is the upper bound of the user address space. `is_vsyscall_vaddr` checks for the legacy vsyscall page, which sits above `TASK_SIZE_MAX` but is still treated as a user-mode address, so the function returns false for it.
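
A standalone sketch of the same predicate; the constants assume x86-64 with 4-level paging and 4 KiB pages:

```c
#include <stdio.h>

#define TASK_SIZE_MAX  0x7ffffffff000UL		/* x86-64 user limit */
#define VSYSCALL_ADDR  0xffffffffff600000UL	/* legacy vsyscall page */

/* The vsyscall page sits above TASK_SIZE_MAX yet is treated as a
 * user-mode address. */
static int fault_in_kernel_space(unsigned long addr)
{
	if ((addr & ~0xfffUL) == VSYSCALL_ADDR)
		return 0;
	return addr >= TASK_SIZE_MAX;
}

int main(void)
{
	printf("%d\n", fault_in_kernel_space(VSYSCALL_ADDR + 8));	/* 0 */
	printf("%d\n", fault_in_kernel_space(0xffffffff81000000UL));	/* 1 */
	return 0;
}
```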
|
---# |
|
arch/x86/mm/fault.c:1340 |
|
--- |
|
/* Handle faults in the user portion of the address space */
|
static inline |
|
void do_user_addr_fault(struct pt_regs *regs, |
|
unsigned long hw_error_code, |
|
unsigned long address) |
|
{ |
|
struct vm_area_struct *vma; |
|
struct task_struct *tsk; |
|
struct mm_struct *mm; |
|
vm_fault_t fault; |
|
unsigned int flags = FAULT_FLAG_DEFAULT; |
|
|
|
tsk = current; |
|
mm = tsk->mm; |
|
# ... |
|
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { ## x ## |
|
bad_area(regs, hw_error_code, address); |
|
return; |
|
} |
|
# ... |
|
check_v8086_mode(regs, address, tsk); |
|
} |
|
--- |
|
`do_user_addr_fault` handles faults in the user portion of the address space. `VM_GROWSDOWN` marks a stack VMA that may be expanded downward. If the faulting address falls below a VMA that lacks this flag, the VMA cannot be grown to cover it, so `bad_area` is called.
|
---# |
|
arch/x86/mm/pat/memtype.c:1085 |
|
--- |
|
/* |
|
* untrack_pfn is called while unmapping a pfnmap for a region. |
|
* untrack can be called for a specific region indicated by pfn and size or |
|
* can be for the entire vma (in which case pfn, size are zero). |
|
*/ |
|
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, |
|
unsigned long size) |
|
{ |
|
resource_size_t paddr; |
|
unsigned long prot; |
|
|
|
if (vma && !(vma->vm_flags & VM_PAT)) ## x ## |
|
return; |
|
|
|
/* free the chunk starting from pfn or the whole chunk */ |
|
paddr = (resource_size_t)pfn << PAGE_SHIFT; |
|
if (!paddr && !size) { |
|
if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { |
|
WARN_ON_ONCE(1); |
|
return; |
|
} |
|
|
|
size = vma->vm_end - vma->vm_start; |
|
} |
|
free_pfn_range(paddr, size); |
|
if (vma) |
|
vma->vm_flags &= ~VM_PAT; |
|
} |
|
--- |
|
`untrack_pfn` untracks the PFN (page frame number) range of a pfnmap. `VM_PAT` indicates that the VMA's mapping is PAT-tracked; if it is not set, there is nothing to untrack and the function returns.
|
---# |
|
arch/x86/mm/tlb.c:559 |
|
--- |
|
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
|
struct task_struct *tsk) |
|
{ |
|
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); |
|
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); |
|
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); |
|
unsigned cpu = smp_processor_id(); |
|
u64 next_tlb_gen; |
|
bool need_flush; |
|
u16 new_asid; |
|
# ... |
|
|
|
if (need_flush) { ## x ## |
|
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); |
|
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); |
|
load_new_mm_cr3(next->pgd, new_asid, true); |
|
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
|
} else { |
|
/* The new ASID is already up to date. */ |
|
load_new_mm_cr3(next->pgd, new_asid, false); |
|
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); |
|
} |
|
|
|
/* Make sure we write CR3 before loaded_mm. */ |
|
barrier(); |
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm, next); |
|
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); |
|
|
|
if (next != real_prev) { |
|
cr4_update_pce_mm(next); |
|
switch_ldt(real_prev, next); |
|
} |
|
} |
|
--- |
|
`switch_mm_irqs_off` switches the active mm. `need_flush` indicates whether the TLB entries cached for the new ASID are stale. If it is true, the per-ASID context metadata is updated and CR3 is loaded with a flush; otherwise CR3 is loaded without flushing.
|
---# |
|
arch/x86/mm/tlb.c:598 |
|
--- |
|
/* |
|
* Please ignore the name of this function. It should be called |
|
* switch_to_kernel_thread(). |
|
* |
|
* enter_lazy_tlb() is a hint from the scheduler that we are entering a |
|
* kernel thread or other context without an mm. Acceptable implementations |
|
* include doing nothing whatsoever, switching to init_mm, or various clever |
|
* lazy tricks to try to minimize TLB flushes. |
|
* |
|
* The scheduler reserves the right to call enter_lazy_tlb() several times |
|
* in a row. It will notify us that we're going back to a real mm by |
|
* calling switch_mm_irqs_off(). |
|
*/ |
|
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) |
|
{ |
|
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) ## x ## |
|
return; |
|
|
|
this_cpu_write(cpu_tlbstate.is_lazy, true); |
|
} |
|
--- |
|
This function is called when the scheduler switches to a kernel thread or another context without an mm. If the CPU is already running `init_mm`, there is nothing to do and the function returns. Otherwise `cpu_tlbstate.is_lazy` is set to true, deferring TLB work until a real mm is switched back in.
|
---# |
|
arch/x86/mm/tlb.c:818 |
|
--- |
|
STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask, |
|
const struct flush_tlb_info *info) |
|
{ |
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); |
|
if (info->end == TLB_FLUSH_ALL) |
|
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); |
|
else |
|
trace_tlb_flush(TLB_REMOTE_SEND_IPI, |
|
(info->end - info->start) >> PAGE_SHIFT); |
|
|
|
/* |
|
* If no page tables were freed, we can skip sending IPIs to |
|
* CPUs in lazy TLB mode. They will flush the CPU themselves |
|
* at the next context switch. |
|
* |
|
* However, if page tables are getting freed, we need to send the |
|
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping |
|
* up on the new contents of what used to be page tables, while |
|
* doing a speculative memory access. |
|
*/ |
|
if (info->freed_tables) ## x ## |
|
smp_call_function_many(cpumask, flush_tlb_func_remote, |
|
(void *)info, 1); |
|
else |
|
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, |
|
(void *)info, 1, cpumask); |
|
} |
|
--- |
|
`native_flush_tlb_others` flushes the TLB of other CPUs. If `info->freed_tables` is true, the flush IPI is sent to every CPU in the mask, including lazy ones, because freed page tables must not be speculatively walked. Otherwise only CPUs that are not in lazy TLB mode are sent the IPI; lazy CPUs will flush at their next context switch.
|
---# |
|
block/bio.c:225 |
|
--- |
|
struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, |
|
mempool_t *pool) |
|
{ |
|
struct bio_vec *bvl; |
|
# ... |
|
|
|
/* |
|
* Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM |
|
* is set, retry with the 1-entry mempool |
|
*/ |
|
bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); |
|
if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { ## x ## |
|
*idx = BVEC_POOL_MAX; |
|
goto fallback; |
|
} |
|
} |
|
|
|
(*idx)++; |
|
return bvl; |
|
} |
|
--- |
|
`bvec_alloc` allocates a bio_vec array (a bio_vec is a vector of pages). ref: http://books.gigatux.nl/mirror/kerneldevelopment/0672327201/ch13lev1sec3.html If the slab allocation fails and `__GFP_DIRECT_RECLAIM` is set, the allocation falls back to the 1-entry mempool.
|
---# |
|
block/bio.c:503
|
--- |
|
|
|
/** |
|
* bio_alloc_bioset - allocate a bio for I/O |
|
* @gfp_mask: the GFP_* mask given to the slab allocator |
|
* @nr_iovecs: number of iovecs to pre-allocate |
|
* @bs: the bio_set to allocate from. |
|
* |
|
* ... |
|
*/ |
|
|
|
struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,

			     struct bio_set *bs)

{
|
gfp_t saved_gfp = gfp_mask; |
|
unsigned front_pad; |
|
unsigned inline_vecs; |
|
struct bio_vec *bvl = NULL; |
|
struct bio *bio; |
|
void *p; |
|
|
|
# ... |
|
|
|
if (nr_iovecs > inline_vecs) { ## x ## |
|
unsigned long idx = 0; |
|
|
|
bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); |
|
if (!bvl && gfp_mask != saved_gfp) { |
|
punt_bios_to_rescuer(bs); |
|
gfp_mask = saved_gfp; |
|
bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); |
|
} |
|
|
|
if (unlikely(!bvl)) |
|
goto err_free; |
|
|
|
bio->bi_flags |= idx << BVEC_POOL_OFFSET; |
|
} else if (nr_iovecs) { |
|
bvl = bio->bi_inline_vecs; |
|
} |
|
|
|
|
|
bio->bi_pool = bs; |
|
bio->bi_max_vecs = nr_iovecs; |
|
bio->bi_io_vec = bvl; |
|
return bio; |
|
|
|
err_free: |
|
mempool_free(p, &bs->bio_pool); |
|
return NULL; |
|
} |
|
--- |
|
`bio_alloc_bioset` allocates a bio. If `nr_iovecs` is greater than `inline_vecs`, a separate bio_vec array is allocated with `bvec_alloc`; if that fails and the gfp mask had been relaxed, pending bios are punted to the rescuer workqueue and the allocation is retried with the original mask. Otherwise, if `nr_iovecs` is not 0, `bvl` points at the inline vectors embedded in the bio itself.
|
---# |
|
block/bio.c:880 |
|
--- |
|
/** |
|
* __bio_try_merge_page - try appending data to an existing bvec. |
|
* @bio: destination bio |
|
* @page: start page to add |
|
* @len: length of the data to add |
|
* @off: offset of the data relative to @page |
|
* @same_page: return if the segment has been merged inside the same page |
|
* |
|
* Try to add the data at @page + @off to the last bvec of @bio. This is a |
|
* useful optimisation for file systems with a block size smaller than the |
|
* page size. |
|
* |
|
* Warn if (@len, @off) crosses pages in case that @same_page is true. |
|
* |
|
* Return %true on success or %false on failure. |
|
*/ |
|
bool __bio_try_merge_page(struct bio *bio, struct page *page, |
|
unsigned int len, unsigned int off, bool *same_page) |
|
{ |
|
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) |
|
return false; |
|
|
|
if (bio->bi_vcnt > 0) { |
|
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; |
|
|
|
if (page_is_mergeable(bv, page, len, off, same_page)) { |
|
if (bio->bi_iter.bi_size > UINT_MAX - len) { ## x ## |
|
*same_page = false; |
|
return false; |
|
} |
|
bv->bv_len += len; |
|
bio->bi_iter.bi_size += len; |
|
return true; |
|
} |
|
} |
|
return false; |
|
} |
|
--- |
|
`__bio_try_merge_page` tries to append data to the last bvec of a bio. If `bio->bi_vcnt` is greater than 0, the last bvec is retrieved and checked for mergeability with the page. Before merging, the size is checked for overflow: if `bio->bi_iter.bi_size` is greater than `UINT_MAX - len`, the merge is refused. Otherwise the bvec length and the bio size are each increased by `len`.
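
The `UINT_MAX - len` comparison is the standard way to test an unsigned addition for overflow without performing the wrapping add. A minimal sketch:

```c
#include <limits.h>
#include <stdio.h>

/* Check for overflow *before* adding: if size > UINT_MAX - len,
 * then size + len would wrap around. */
static int grow_size(unsigned int *size, unsigned int len)
{
	if (*size > UINT_MAX - len)
		return 0;	/* refuse the merge */
	*size += len;
	return 1;
}

int main(void)
{
	unsigned int size = UINT_MAX - 10;

	printf("%d\n", grow_size(&size, 5));	/* 1: fits */
	printf("%d\n", grow_size(&size, 100));	/* 0: would overflow */
	return 0;
}
```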
|
---# |
|
block/bio.c:918 |
|
--- |
|
/** |
|
* __bio_add_page - add page(s) to a bio in a new segment |
|
* @bio: destination bio |
|
* @page: start page to add |
|
* @len: length of the data to add, may cross pages |
|
* @off: offset of the data relative to @page, may cross pages |
|
* |
|
* Add the data at @page + @off to @bio as a new bvec. The caller must ensure |
|
* that @bio has space for another bvec. |
|
*/ |
|
void __bio_add_page(struct bio *bio, struct page *page, |
|
unsigned int len, unsigned int off) |
|
{ |
|
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; |
|
|
|
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); |
|
WARN_ON_ONCE(bio_full(bio, len)); |
|
|
|
bv->bv_page = page; |
|
bv->bv_offset = off; |
|
bv->bv_len = len; |
|
|
|
bio->bi_iter.bi_size += len; |
|
bio->bi_vcnt++; |
|
|
|
if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) ## x ## |
|
bio_set_flag(bio, BIO_WORKINGSET); |
|
} |
|
EXPORT_SYMBOL_GPL(__bio_add_page); |
|
--- |
|
`__bio_add_page` is the function that adds a page to a bio in a new segment. The |
|
caller must ensure that the bio has space for another bvec. The page is added |
|
to the bio and the size of the bio is increased by `len`. If the page is in the |
|
workingset, the bio is flagged as being in the workingset (a workingset is a |
|
set of pages that are frequently accessed). |
|
---# |
|
block/blk-core.c:832 |
|
--- |
|
static noinline_for_stack bool submit_bio_checks(struct bio *bio) |
|
{ |
|
struct request_queue *q = bio->bi_disk->queue; |
|
blk_status_t status = BLK_STS_IOERR; |
|
struct blk_plug *plug; |
|
|
|
might_sleep(); |
|
|
|
plug = blk_mq_plug(q, bio); |
|
if (plug && plug->nowait) |
|
bio->bi_opf |= REQ_NOWAIT; |
|
|
|
/* |
|
* For a REQ_NOWAIT based request, return -EOPNOTSUPP |
|
* if queue does not support NOWAIT. |
|
*/ |
|
if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) |
|
goto not_supported; |
|
|
|
if (should_fail_bio(bio)) |
|
goto end_io; |
|
|
|
if (bio->bi_partno) { |
|
if (unlikely(blk_partition_remap(bio))) |
|
goto end_io; |
|
} else { |
|
if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) ## x ## |
|
goto end_io; |
|
if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) |
|
goto end_io; |
|
} |
|
|
|
# ... |
|
|
|
not_supported: |
|
status = BLK_STS_NOTSUPP; |
|
end_io: |
|
bio->bi_status = status; |
|
bio_endio(bio); |
|
return false; |
|
} |
|
--- |
|
`submit_bio_checks` checks whether a bio may be submitted. If the bio would write to a read-only disk or partition (`bio_check_ro`), it is ended with an error status and `false` is returned.
|
---# |
|
block/blk-core.c:1269 |
|
--- |
|
static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) |
|
{ |
|
unsigned long stamp; |
|
again: |
|
stamp = READ_ONCE(part->stamp); |
|
if (unlikely(stamp != now)) { ## x ## |
|
if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) |
|
__part_stat_add(part, io_ticks, end ? now - stamp : 1); |
|
} |
|
if (part->partno) { |
|
part = &part_to_disk(part)->part0; |
|
goto again; |
|
} |
|
} |
|
--- |
|
`update_io_ticks` updates the io_ticks statistic of a partition. `part->stamp` is the jiffies timestamp of the last update. If `stamp` differs from `now`, a `cmpxchg` atomically advances `stamp` to `now`; only the CPU whose `cmpxchg` wins accounts the time, adding `now - stamp` when `end` is true and 1 otherwise.
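
The lockless update can be modeled with a C11 compare-and-swap: only the caller whose exchange wins accounts the elapsed time, so concurrent updaters never double-count. A single-threaded sketch of the arithmetic:

```c
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long stamp;
static unsigned long io_ticks;

/* Only the CPU whose compare-and-swap wins accounts the elapsed
 * ticks, so concurrent updaters never double-count. */
static void update_ticks(unsigned long now, int end)
{
	unsigned long old = atomic_load(&stamp);

	if (old != now &&
	    atomic_compare_exchange_strong(&stamp, &old, now))
		io_ticks += end ? now - old : 1;
}

int main(void)
{
	update_ticks(100, 0);	/* I/O started: +1 tick */
	update_ticks(150, 1);	/* I/O ended: +50 ticks */
	printf("io_ticks=%lu\n", io_ticks);
	return 0;
}
```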
|
---# |
|
block/blk-core.c:1272 |
|
--- |
|
static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) |
|
{ |
|
unsigned long stamp; |
|
again: |
|
stamp = READ_ONCE(part->stamp); |
|
if (unlikely(stamp != now)) { |
|
if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) |
|
__part_stat_add(part, io_ticks, end ? now - stamp : 1); |
|
} |
|
if (part->partno) { ## x ## |
|
part = &part_to_disk(part)->part0; |
|
goto again; |
|
} |
|
} |
|
--- |
|
`update_io_ticks` updates the io_ticks of a partition. `part->partno` is the partition number; if it is not 0, `part` is pointed at the whole disk (`part0`) and the loop runs again, so the statistic is also accounted to the disk itself.
|
---# |
|
block/blk-core.c:1775 |
|
--- |
|
/** |
|
* blk_finish_plug - mark the end of a batch of submitted I/O |
|
* @plug: The &struct blk_plug passed to blk_start_plug() |
|
* |
|
* Description: |
|
* Indicate that a batch of I/O submissions is complete. This function |
|
* must be paired with an initial call to blk_start_plug(). The intent |
|
* is to allow the block layer to optimize I/O submission. See the |
|
* documentation for blk_start_plug() for more information. |
|
*/ |
|
void blk_finish_plug(struct blk_plug *plug) |
|
{ |
|
if (plug != current->plug) ## x ## |
|
return; |
|
blk_flush_plug_list(plug, false); |
|
|
|
current->plug = NULL; |
|
} |
|
--- |
|
`blk_finish_plug` marks the end of a batch of submitted I/O. If `plug` is not the current task's plug, the function returns. Otherwise the plug list is flushed and `current->plug` is set to `NULL`.
|
---# |
|
block/blk-mq-sched.c:341 |
|
--- |
|
/* |
|
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts |
|
* its queue by itself in its completion handler, so we don't need to |
|
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. |
|
* |
|
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to |
|
* be run again. This is necessary to avoid starving flushes. |
|
*/ |
|
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) |
|
{ |
|
struct request_queue *q = hctx->queue; |
|
struct elevator_queue *e = q->elevator; |
|
bool multi_hctxs = false, run_queue = false; |
|
bool dispatched = false, busy = false; |
|
unsigned int max_dispatch; |
|
LIST_HEAD(rq_list); |
|
int count = 0; |
|
# ... |
|
if (busy) ## x ## |
|
return -EAGAIN; |
|
return !!dispatched; |
|
} |
|
--- |
|
`__blk_mq_do_dispatch_sched` dispatches requests from the I/O scheduler. If the hctx (hardware context) was found busy, -EAGAIN is returned so run_work is re-run. Otherwise it returns `!!dispatched`: 1 if any request was dispatched, 0 if not.
|
---# |
|
block/blk-mq.c:605 |
|
--- |
|
static void blk_mq_trigger_softirq(struct request *rq) |
|
{ |
|
struct list_head *list; |
|
unsigned long flags; |
|
|
|
local_irq_save(flags); |
|
list = this_cpu_ptr(&blk_cpu_done); |
|
list_add_tail(&rq->ipi_list, list); |
|
|
|
/* |
|
* If the list only contains our just added request, signal a raise of |
|
* the softirq. If there are already entries there, someone already |
|
* raised the irq but it hasn't run yet. |
|
*/ |
|
if (list->next == &rq->ipi_list) |
|
raise_softirq_irqoff(BLOCK_SOFTIRQ); ## x ## |
|
local_irq_restore(flags); |
|
} |
|
--- |
|
`blk_mq_trigger_softirq` queues a completed request on the per-CPU done list and triggers the block softirq. If `list->next` is equal to `&rq->ipi_list`, the just-added request is the only entry, so the softirq is raised. Otherwise the softirq was already raised and simply hasn't run yet.
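
The `list->next == &rq->ipi_list` test is an empty-to-non-empty transition check. A toy sketch of the same idea, using a counter in place of the list:

```c
#include <stdio.h>

static int queued;	/* entries on the per-CPU done list */

/* Signal only on the empty -> non-empty transition: if entries are
 * already queued, a previous add raised the event and it is pending. */
static void enqueue(void)
{
	if (queued++ == 0)
		printf("raise softirq\n");
}

int main(void)
{
	enqueue();	/* first entry: raises */
	enqueue();	/* already pending: no raise */
	return 0;
}
```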
|
---# |
|
block/bounce.c:377
|
--- |
|
void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) |
|
{ |
|
mempool_t *pool; |
|
|
|
/* |
|
* Data-less bio, nothing to bounce |
|
*/ |
|
if (!bio_has_data(*bio_orig)) |
|
return; |
|
|
|
/* |
|
* for non-isa bounce case, just check if the bounce pfn is equal |
|
* to or bigger than the highest pfn in the system -- in that case, |
|
* don't waste time iterating over bio segments |
|
*/ |
|
if (!(q->bounce_gfp & GFP_DMA)) { |
|
if (q->limits.bounce_pfn >= blk_max_pfn) ## x ## |
|
return; |
|
pool = &page_pool; |
|
} else { |
|
BUG_ON(!mempool_initialized(&isa_page_pool)); |
|
pool = &isa_page_pool; |
|
} |
|
|
|
/* |
|
* slow path |
|
*/ |
|
__blk_queue_bounce(q, bio_orig, pool); |
|
} |
|
--- |
|
`blk_queue_bounce` bounces the high pages of a bio into low memory if the queue requires it. In the non-ISA case, if the queue's bounce pfn is equal to or bigger than the highest pfn in the system, no page can need bouncing and the function returns early.
|
---# |
|
fs/exec.c:449 |
|
--- |
|
/**
|
* count() counts the number of strings in array ARGV. |
|
*/ |
|
static int count(struct user_arg_ptr argv, int max) |
|
{ |
|
int i = 0; |
|
|
|
if (argv.ptr.native != NULL) { |
|
for (;;) { |
|
const char __user *p = get_user_arg_ptr(argv, i); |
|
|
|
if (!p) |
|
break; |
|
|
|
if (IS_ERR(p)) |
|
return -EFAULT; |
|
|
|
if (i >= max) |
|
return -E2BIG; |
|
++i; |
|
|
|
if (fatal_signal_pending(current)) |
|
return -ERESTARTNOHAND; |
|
cond_resched(); ## x ## |
|
} |
|
} |
|
return i; |
|
} |
|
--- |
|
`count` counts the number of strings in a user-supplied array such as argv. `p` is the user pointer at index `i` in the array. `cond_resched()` is called on each iteration so that this potentially long loop voluntarily yields the CPU.
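
A userspace analogue of the same loop over a NULL-terminated array, without the user-copy and signal checks:

```c
#include <stdio.h>

/* Count entries of a NULL-terminated pointer array, capping at max
 * (the kernel returns -E2BIG instead of -1). */
static int count(char *const argv[], int max)
{
	int i = 0;

	while (argv[i]) {
		if (i >= max)
			return -1;
		i++;
	}
	return i;
}

int main(int argc, char *argv[])
{
	(void)argc;
	printf("argc counted: %d\n", count(argv, 1024));
	return 0;
}
```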
|
---# |
|
fs/exec.c:1022 |
|
--- |
|
/* |
|
* Maps the mm_struct mm into the current task struct. |
|
* On success, this function returns with the mutex |
|
* exec_update_mutex locked. |
|
*/ |
|
static int exec_mmap(struct mm_struct *mm) |
|
{ |
|
struct task_struct *tsk; |
|
struct mm_struct *old_mm, *active_mm; |
|
int ret; |
|
# ... |
|
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) |
|
local_irq_enable(); |
|
activate_mm(active_mm, mm); |
|
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) |
|
local_irq_enable(); |
|
tsk->mm->vmacache_seqnum = 0; ## x ## |
|
vmacache_flush(tsk); |
|
task_unlock(tsk); |
|
if (old_mm) { |
|
mmap_read_unlock(old_mm); |
|
BUG_ON(active_mm != old_mm); |
|
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); |
|
mm_update_next_owner(old_mm); |
|
mmput(old_mm); |
|
return 0; |
|
} |
|
mmdrop(active_mm); |
|
return 0; |
|
} |
|
--- |
|
`exec_mmap` installs the mm_struct `mm` into the current task. If `CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM` is enabled, interrupts stay disabled across `activate_mm` and `local_irq_enable` is called afterwards; otherwise interrupts are enabled first. The marked line then resets `vmacache_seqnum`, and `vmacache_flush` invalidates the per-task VMA cache, since any cached VMAs belong to the old mm.
|
---# |
|
fs/exec.c:1850 |
|
--- |
|
/* |
|
* sys_execve() executes a new program. |
|
*/ |
|
static int bprm_execve(struct linux_binprm *bprm, |
|
int fd, struct filename *filename, int flags) |
|
{ |
|
struct file *file; |
|
struct files_struct *displaced; |
|
int retval; |
|
|
|
/* |
|
* Cancel any io_uring activity across execve |
|
*/ |
|
io_uring_task_cancel(); |
|
|
|
retval = unshare_files(&displaced); |
|
if (retval) |
|
return retval; |
|
# ... |
|
out_files: |
|
if (displaced) |
|
reset_files_struct(displaced); ## x ## |
|
|
|
return retval; |
|
} |
|
--- |
|
`bprm_execve` executes a new program. `displaced` is the old files struct left behind by `unshare_files()`. On the error path, `reset_files_struct` is called to restore it.
|
---# |