链表存储的存储结构所占的存储空间(lru链表内存回收主要内容)
链表存储的存储结构所占的存储空间(lru链表内存回收主要内容)影响所有内存回收/* 初始化内存回收控制结构体,并检测当前进程是否需要进入pgdat->pfmemalloc_wait等待队列 * 1. 初始化sc,回收数量目标是SWAP_CLUSTER_MAX? * 2. throttle_direct_reclaim:判断当前内存申请进程是否要进入等待队列 * 3. do_try_to_free_pages:触发内存回收;直到回收到目标数量page或内存压缩可以解决问题 */ unsigned long try_to_free_pages(struct zonelist *zonelist int order gfp_t gfp_mask nodemask_t *nodemask) { unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_rec
内存回收主要内容(lru链表中页的内存回收)- 回收的页都是非活动匿名页lru链表或者非活动文件页lru链表上的页,包括: 进程堆、栈、匿名mmap共享内存映射、shmem共享内存映射使用的页、映射磁盘文件页
- 根本方法:将page->_count降到0;然后就根据不同的情况分别处理:干净页,并且映射了磁盘文件的页,直接回收没有进程映射,并且没有映射磁盘文件的页,直接回收文件页:脏页(PG_dirty置位),回写到对应磁盘文件中,然后回收匿名页:有进程映射,并且没有映射磁盘文件的页,回写到swap分区中,然后回收
- 触发路径:但无论哪种路径,最后的回收入口函数均为shrink_zone 都是着眼于node维度被动式:由内核线程kswapd直接调用内存回收逻辑进行回收,对每个zone从低到高的方向扫描主动式:在从伙伴系统申请内存(__alloc_pages)时进入分配逻辑时进行内存回收,从高到低扫描
内存回收分为两篇来描述,本篇主要说明触发内存回收的不同入口,以及他们在启动内存回收前的处理措施
由kswapd触发内核内存回收路径kswapd_initsetup_arch()->paging_init()->bootmem_init()->zone_sizes_init()->free_area_init_node()->free_area_init_core()->init_waitqueue_head(&pgdat->kswapd_wait)
static int __init kswapd_init(void)
{
int nid;
swap_setup();
for_each_node_state(nid N_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback 0);
return 0;
}
更大Linux内核视频教程文档资料免费领取后台私信【内核】自行获取。
内核网站:
Linux内核源码/内存调优/文件系统/进程管理/设备驱动/网络协议栈-学习视频教程-腾讯课堂
kswapd_runint kswapd_run(int nid)
{
/* 获取内存节点对应的pg_data_t指针 */
pg_data_t *pgdat = NODE_DATA(nid);
int ret = 0;
if (pgdat->kswapd)
return 0;
/* kswapd函数,pgdat作为参数传入kswapd函数 */
pgdat->kswapd = kthread_run(kswapd pgdat "kswapd%d" nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
pr_err("Failed to start kswapd on node %d\n" nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
}
return ret;
}
kswapd
/*
* kswapd线程每隔一段时间就自动平衡空闲页面,从低到高扫描每个zone,使每个zone的空闲页面维持在一定水平
*
*/
static int kswapd(void *p)
{
unsigned long order new_order;
unsigned balanced_order;
int classzone_idx new_classzone_idx;
int balanced_classzone_idx;
/* 每个node一个kswapd线程,对应一个pg_data_t结构体 */
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0
};
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
lockdep_set_current_reclaim_state(GFP_KERNEL);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk cpumask);
current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator"
* and that if we need more memory we should get access to it
* regardless (see "__alloc_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway but sometimes
* you need a small amount of memory in order to be able to
* page out something else and this flag essentially protects
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
order = new_order = 0;
balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
* If the last balance_pgdat was unsuccessful it's unlikely a
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
if (balanced_classzone_idx >= new_classzone_idx &&
balanced_order == new_order) {
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
* allocation or has tigher zone constraints
*/
order = new_order;
classzone_idx = new_classzone_idx;
} else {
/* 在此处睡眠,等待wakeup_kswapd来唤醒 */
kswapd_try_to_sleep(pgdat balanced_order
balanced_classzone_idx);
/* pgdata->kswapd_max_order和pgdat->classzone_id已经在wakeup_kswapd中进行了更新 */
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
new_order = order;
new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
/* 内核进程冻结技术,详细介绍参考文献
* 1. 内核为每个进程在适当的时候调用try_to_freeze来设置PF_FREEZE标志,
* 2. 当系统要suspend的时候,系统那边会调用freeze_process函数来将所有可冷冻的任务的TIF_FREEZE标志置位
* 3. 然后所有有TIF_FREEZE标志的进程会睡眠
*/
ret = try_to_freeze();
/* 判断内核线程是否应该停止,当有人调用kthread_stop()时,此处为真 */
if (kthread_should_stop())
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id order);
balanced_classzone_idx = classzone_idx;
/* 启动内存回收具体任务 */
balanced_order = balance_pgdat(pgdat order
&balanced_classzone_idx);
}
}
/* 当kthread被停止,修改进程标识,具体什么意思? */
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
return 0;
}
balance_pgdat
/* 对node中zone,从低往高进行内存释放
* 以默认优先级12,执行多次遍历node内存释放;只有当在当前优先优先级无法回收到内存情况下,才进行sc->priority--
* 1. 从高到底找出第一不平衡的zone
* 2. 循环对第一个不平衡的其下面的zone进行内存回收
检测node是否需要内存压缩:只要有一个zone能以low level分配order页框则不需要
mem_cgroup_soft_limit_reclaim:对zone内memcg中超过soft_limit_in_bytes的进程内存进行软回收
kswapd_shrink_zone->shrink_zone: 进行内存回收(如果内存平衡则不需要进行回收),回收不成功则降低回收优先级
pfmemalloc_watermark_ok判断OK,则唤醒pfmemalloc_wait队列中的等待进程
如果要进行内存压缩的话,则启动
* 本node内存balance(pgdat_balanced) || sc->priority==0 停止对node循环内存回收
*/
static unsigned long balance_pgdat(pg_data_t *pgdat int order
int *classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL
.order = order
/* 优先级使用默认为的12,会执行多次遍历node(并不是node中的所有zone),但并不会每次遍历都进行sc->priority--,当能够回收的内存时,才进行sc->priority-- */
.priority = DEF_PRIORITY
.may_writepage = !laptop_mode
.may_unmap = 1
.may_swap = 1
};
count_vm_event(PAGEOUTRUN);
do {
unsigned long nr_attempted = 0;
bool raise_priority = true;
/* 如果要求收集的内存order>0 则在回收内存后,启动内存压缩,整理碎片 */
bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
/*
* 确定内存释放区的zone id上限:
* 1. 通过age_active_anon检查该zone lru链表中inactive_list中page是否足够
* 2. 为什么上限是第一个buffer_header,或不需要进行内存平衡的区域?
*/
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones i;
/* 如果zone中没有page,则跳过本zone */
if (!populated_zone(zone))
continue;
/* ?? */
if (sc.priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue;
/* 通过inactive_anon_is_low()判断本zone的lru inactive链表中页面数量是否足够
* 如果不够,则使用shrink_active_list()释放更多active页面到inactive链表
* shrink_active_list、shrink_zone等更多的收缩流程在下一篇中介绍
*/
age_active_anon(zone &sc);
/*
* If the number of buffer_heads in the machine
* exceeds the maximum allowed level and this node
* has a highmem zone force kswapd to reclaim from
* it to relieve lowmem pressure.
*/
if (buffer_heads_over_limit && is_highmem_idx(i)) {
end_zone = i;
break;
}
if (!zone_balanced(zone order 0 0)) {
/* 找到第一个不平衡的zone,平衡条件
* 1. 此zone分配页框后剩余的页框数量 > 此zone的high阀值 此zone保留的页框数量
* 2. 申请的order阶内存,可以通过内存压缩达到
*/
end_zone = i;
break;
} else {
/*
* If balanced clear the dirty and congested
* flags
*/
clear_bit(ZONE_CONGESTED &zone->flags);
clear_bit(ZONE_DIRTY &zone->flags);
}
}
if (i < 0)
goto out;
for (i = 0; i <= end_zone; i ) {
struct zone *zone = pgdat->node_zones i;
if (!populated_zone(zone))
continue;
/* 如果内存回收后,足够申请order阶的新内存,则不启动内存压缩
* 所有需要扫描的低端zone只有有一个满足上述条件即可
*/
if (pgdat_needs_compaction &&
zone_watermark_ok(zone order
low_wmark_pages(zone)
*classzone_idx 0))
pgdat_needs_compaction = false;
}
/*
* If we're getting trouble reclaiming start doing writepage
* even in laptop mode.
*/
/* 优先级越低,则要求回收的内存越多 */
if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/*
* 接下来从低到高扫描zone,停止在end_zone上
*/
for (i = 0; i <= end_zone; i ) {
struct zone *zone = pgdat->node_zones i;
if (!populated_zone(zone))
continue;
if (sc.priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue;
sc.nr_scanned = 0;
nr_soft_scanned = 0;
/*
* 回收该zone上超过soft_limit最多的mem_cgroup在该zone上mem_cgroup_per_zone对应的lru链表
* 直接回收、间接回收都会调用该函数,是调用shrink_zone()前的必备动作
* 通过全局mem group结构体soft_limit_tree->rb_tree_per_node->rb_tree_per_zone->rb_root找到mem_cgroup_per_zone
*/
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone
order sc.gfp_mask
&nr_soft_scanned);
sc.nr_reclaimed = nr_soft_reclaimed;
/*
* There should be no need to raise the scanning
* priority if enough pages are already being scanned
* that that high watermark would be met at 100%
* efficiency.
*/
/* 如果回收成功则不需要降低回收优先级 */
if (kswapd_shrink_zone(zone end_zone
&sc &nr_attempted))
raise_priority = false;
}
/*
* If the low watermark is met there is no need for processes
* to be throttled on pfmemalloc_wait as they should not be
* able to safely make forward progress. Wake them
*/
/* 执行完一轮内存回收后,唤醒挂在pfmemalloc_wait队列中等待进行内存分配的任务
* 1. 通过waitqueue_active判断pfmemalloc_wait队列中是否有成员
* 2. pfmemalloc_watermark_ok判断本zone是否有足够空闲内存用于分配,如果没有会唤醒kswapd线程回收内存
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/*
* Fragmentation may mean that the system cannot be rebalanced
* for high-order allocations in all zones. If twice the
* allocation size has been reclaimed and the zones are still
* not balanced then recheck the watermarks at order-0 to
* prevent kswapd reclaiming excessively. Assume that a
* process requested a high-order can direct reclaim/compact.
*/
if (order && sc.nr_reclaimed >= 2UL << order)
order = sc.order = 0;
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/* 如果需要内存压缩则启动内存压缩 */
if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
compact_pgdat(pgdat order);
/*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
!pgdat_balanced(pgdat order *classzone_idx));
out:
/*
* Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However
* if another caller entered the allocator slow path while kswapd
* was awake order will remain at the higher level
*/
/* 记录本次均衡操作的最终zone id */
*classzone_idx = end_zone;
return order;
}
kswapd_shrink_zone
/*
* 对zone执行内存回收:
* 1. 确定一个zone需要回收的内存数量
* 2. 检测目前zone的内存压力,尽量少执行内存回收
* 3. 通过shrink_zone()实际执行内存回收
*/
static bool kswapd_shrink_zone(struct zone *zone
int classzone_idx
struct scan_control *sc
unsigned long *nr_attempted)
{
int testorder = sc->order;
unsigned long balance_gap;
bool lowmem_pressure;
/* 确定本zone需要回收的页框数量;目标是要使水位回到high levle. */
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX high_wmark_pages(zone));
/*
* Kswapd reclaims only single pages with compaction enabled. Trying
* too hard to reclaim until contiguous free pages have become
* available can hurt performance by evicting too much useful data
* from memory. Do not reclaim more than needed for compaction.
*/
/* 如果可以内存压缩可以解决需求order,只要回收单个页面即可.避免连续回收页面对系统性能造成影响 */
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
compaction_suitable(zone sc->order 0 classzone_idx)
!= COMPACT_SKIPPED)
testorder = 0;
/*
* We put equal pressure on every zone unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
* watermark or 1% of the zone whichever is smaller.
*/
balance_gap = min(low_wmark_pages(zone) DIV_ROUND_UP(
zone->managed_pages KSWAPD_ZONE_BALANCE_GAP_RATIO));
/*
* If there is no low memory pressure or the zone is balanced then no
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
/* 如果低端内存压力不大,则不需要进行内存回收 */
if (!lowmem_pressure && zone_balanced(zone testorder
balance_gap classzone_idx))
return true;
/* 内存回收入口函数 */
shrink_zone(zone sc zone_idx(zone) == classzone_idx);
/* 计算尝试回收的各个zone内存数量总和 */
*nr_attempted = sc->nr_to_reclaim;
clear_bit(ZONE_WRITEBACK &zone->flags);
/*
* If a zone reaches its high watermark consider it to be no longer
* congested. It's possible there are dirty pages backed by congested
* BDIs but as pressure is relieved speculatively avoid congestion
* waits.
*/
if (zone_reclaimable(zone) &&
zone_balanced(zone testorder 0 classzone_idx)) {
clear_bit(ZONE_CONGESTED &zone->flags);
clear_bit(ZONE_DIRTY &zone->flags);
}
return sc->nr_scanned >= sc->nr_to_reclaim;
}
总结
- 开始标志:zonelist的所有zone都不能通过min阀值获取到页框时,会唤醒所有node的kswapd内核线程,然后在kswapd中会对不满足 zone分配页框后剩余的页框数量 > 此zone的high阀值 此zone保留的页框数量 的zone进行内存回收
- 结束标志:node中所有zone都满足 zone分配页框后剩余的页框数量 > 此zone的high阀值 此zone保留的页框数量(可能会进行多次shrink_zone()的调用)
- 回收对象:超过所在memcg的soft_limit_in_bytes的进程的内存、zone的可回收页框、slab
__alloc_pages()->__alloc_pages_nodemask()->get_page_from_freelist()->zone_reclaim->__zone_reclaim()->shrink_zone()
__zone_reclaim/*
* 快速内存分配失败后触发本函数进行内存回收;但是并不能保证回收后必然可以分配order阶内存块?
* 1. 根据目前zone_reclaim_mode确定内存回收sc控制结构体
* 2. zone_pagecache_reclaimable:根据zone_reclaim_mdoe状态计算可供回收的页面数量
* 3. 对不同priority级别进行循环内存回收,直到回收到足够的页面
*/
static int __zone_reclaim(struct zone *zone gfp_t gfp_mask unsigned int order)
{
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
struct scan_control sc = {
/* 最少一次回收SWAP_CLUSTER_MAX,最多一次回收1 << order个,应该是1024个 */
.nr_to_reclaim = max(nr_pages SWAP_CLUSTER_MAX)
/* 当前进程明确禁止分配内存的IO操作(禁止__GFP_IO,__GFP_FS标志),那么则清除__GFP_IO,__GFP_FS标志,表示不进行IO操作 */
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask))
.order = order
/* 优先级为4,默认是12,会比12一次扫描更多lru链表中的页框,而且扫描次数会比优先级为12的少,并且如果回收过程中回收到了足够页框,就会返回 */
.priority = ZONE_RECLAIM_PRIORITY
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE) /* 根据zone_reclaim_mdoe判断,允许file页回写磁盘操作 */
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP) /* 根据zone_reclaim_mdoe判断,允许unmap操作 */
.may_swap = 1 /* 允许匿名页swap */
};
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
* and we also need to be able to write out pages for RECLAIM_WRITE
* and RECLAIM_SWAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
/*
* 根据目前的zone_reclaim_mode计算有多少潜在的page可以被回收
* 0x0:只会对最有zone附近的几个需要进行内存回收的zone进行内存回收;其余的会对所有zone回收
* 0x1:开启zone的内存回收
* 0x2:允许回写
* 0x4:允许unmap操作
*/
if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
/* 内存回收入口函数,对本zone进行循环内存回收,直到收集到足够的页面 */
shrink_zone(zone &sc true);
/* 最多进行4次调用shrink_zone(),并且每次调用shrink_zone()扫描的页框会越来越多,直到回收到了1<<order个页框为止 */
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
}
总结
- 开始标志是:此zone分配后剩余的页框数量 > 此zone的阀值 此zone的保留页框数量(阀值可能是:min low high其中一个)
- 结束标志是:对此zone回收到了本次分配时需要的页框数量 或者 sc->priority降为0(可能会进行多次shrink_zone()的调用)
- 回收对象:zone的可回收页框、slab
__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_reclaim()->__perform_reclaim()->try_to_free_pages()->do_try_to_free_pages()->shrink_zone()
try_to_free_pages/* 初始化内存回收控制结构体,并检测当前进程是否需要进入pgdat->pfmemalloc_wait等待队列
* 1. 初始化sc,回收数量目标是SWAP_CLUSTER_MAX?
* 2. throttle_direct_reclaim:判断当前内存申请进程是否要进入等待队列
* 3. do_try_to_free_pages:触发内存回收;直到回收到目标数量page或内存压缩可以解决问题
*/
unsigned long try_to_free_pages(struct zonelist *zonelist int order
gfp_t gfp_mask nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX /* 只打算回收的页框为32个? */
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask))
.order = order
.nodemask = nodemask
.priority = DEF_PRIORITY
/* 与/proc/sys/vm/laptop_mode文件有关
* laptop_mode为0,则允许进行回写操作,即使允许回写,直接内存回收也不能对脏文件页进行回写
* 不过允许回写时,可以对非文件页进行回写
*/
.may_writepage = !laptop_mode
.may_unmap = 1 /* 允许进行unmap操作 */
.may_swap = 1 /* 允许进行非文件页的操作 */
};
/* 判断node是否平衡,如果平衡,则继续进行内存回收
* 如果不平衡,如果此node的kswapd没有被唤醒,则唤醒,并且这里唤醒kswapd只会对ZONE_NORMAL以下的zone进行内存回收
* 此node的ZONE_DMA和ZONE_NORMAL的总共空闲页框数量 > 此node的ZONE_DMA和ZONE_NORMAL的平均min阀值数量
*/
if (throttle_direct_reclaim(gfp_mask zonelist nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order
sc.may_writepage
gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
return nr_reclaimed;
}
throttle_direct_reclaim
/*
* 判断当前进程是否加入到pgdat->pfmemalloc_wait等待队列中
* 1. 如果为内核进程,则返回
* 2. 如果当前进程已接收到kill信号,则返回
* 3. 判断最优node(最优zone所属的node)是否pfmemalloc balance,否则加入pgdat->pfmemalloc_wait等待队列中
a. 有__GFP_FS: 加入队列没有超时限制,并且进程状态为TASK_KILLABLE
b. 无__GFP_FS: 超时时间为1s,且状态为TASK_INTERRUPTABLE
*/
static bool throttle_direct_reclaim(gfp_t gfp_mask struct zonelist *zonelist
nodemask_t *nodemask)
{
struct zoneref *z;
struct zone *zone;
pg_data_t *pgdat = NULL;
/* 如果标记了PF_KTHREAD,表示此进程是一个内核线程,则不会往下执行 */
if (current->flags & PF_KTHREAD)
goto out;
/*
* If a fatal signal is pending this process should not throttle.
* It should return quickly so it can exit and free its memory
*/
/* 此进程已经接收到了kill信号,准备要被杀掉了 */
if (fatal_signal_pending(current))
goto out;
/* 遍历zonelist,但是里面只会在获取到第一个pgdat时就跳出
* 但不能是zone_highmem,因为有可能pfmemalloc队列中等待的任务有GFP_KERNEL分配标志,不能从zone_highmem分配内存
*/
for_each_zone_zonelist_nodemask(zone z zonelist
gfp_zone(gfp_mask) nodemask) {
/* 只遍历ZONE_NORMAL和ZONE_DMA区 */
if (zone_idx(zone) > ZONE_NORMAL)
continue;
/* 获取zone对应的node */
pgdat = zone->zone_pgdat;
/* 判断node是否平衡,如果平衡,则返回真
* 如果不平衡,如果此node的kswapd没有被唤醒,则唤醒,并且这里唤醒kswapd只会对ZONE_NORMAL以下的zone进行内存回收
* node是否平衡的判断标准是:
* 此node的ZONE_DMA和ZONE_NORMAL的总共空闲页框数量 > 此node的ZONE_DMA和ZONE_NORMAL的平均min阀值数量
*/
if (pfmemalloc_watermark_ok(pgdat))
goto out;
break;
}
/* If no zone was usable by the allocation flags then do not throttle */
if (!pgdat)
goto out;
/* Account for the throttling */
count_vm_event(PGSCAN_DIRECT_THROTTLE);
/* 如果分配标志禁止了文件系统操作,则将要进行内存回收的进程设置为TASK_INTERRUPTIBLE状态,然后加入到node的pgdat->pfmemalloc_wait,并且会设置超时时间为1s
* 1.pfmemalloc_watermark_ok(pgdat)为真时被唤醒,而1s没超时,返回剩余timeout(jiffies)
* 2.睡眠超过1s时会唤醒,而pfmemalloc_watermark_ok(pgdat)此时为真,返回1
* 3.睡眠超过1s时会唤醒,而pfmemalloc_watermark_ok(pgdat)此时为假,返回0
* 4.接收到信号被唤醒,返回-ERESTARTSYS
*/
if (!(gfp_mask & __GFP_FS)) {
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait
pfmemalloc_watermark_ok(pgdat) HZ);
goto check_pending;
}
/* 如果分配标志没有禁止了文件系统操作
* 将本进程加入到node的pgdat->pfmemalloc_wait,并设置为TASK_KILLABLE状态,表示允许 TASK_UNINTERRUPTIBLE 响应致命信号的状态
* 这些进程在两种情况下被唤醒
* 1.pfmemalloc_watermark_ok(pgdat)为真时
* 2.接收到致命信号时
*/
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait
pfmemalloc_watermark_ok(pgdat));
check_pending:
/* 如果加入到了pgdat->pfmemalloc_wait后被唤醒,就会执行到这
* 唤醒后再次检查当前进程是否接受到了kill信号,准备退出
*/
if (fatal_signal_pending(current))
return true;
out:
return false;
}
do_try_to_free_pages
/*
* 尝试对所有优先级别进行内存回收,直到以下两个条件或成立
* 1. 已经回收到足够的页面
* 2. 回收到的页面已经足够,只要启动内存压缩即可满足要求
* 3. 期间如果累计扫描过的页面较多,会触发所有bdi设备启动flush回写线程
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist
struct scan_control *sc)
{
int initial_priority = sc->priority;
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
bool zones_reclaimable;
retry:
delayacct_freepages_start();
if (global_reclaim(sc))
count_vm_event(ALLOCSTALL);
/*
* 尝试对所有优先级别进行内存回收,直到以下两个条件或成立
* 1. 已经回收到足够的页面
* 2. 回收到的页面已经足够,只要启动内存压缩即可满足要求
*/
do {
vmpressure_prio(sc->gfp_mask sc->target_mem_cgroup
sc->priority);
sc->nr_scanned = 0;
zones_reclaimable = shrink_zones(zonelist sc);
total_scanned = sc->nr_scanned;
/* 回收到足够的page,则退出 */
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
break;
if (sc->compaction_ready)
break;
/*
* If we're getting trouble reclaiming start doing
* writepage even in laptop mode.
*/
if (sc->priority < DEF_PRIORITY - 2)
sc->may_writepage = 1;
/*
* 如果扫描过的page数量>需要回收页面数量*1.5,还没有完成回收任务,则触发强制回写dirty page到磁盘
* 在default_bdi_init()中会初始化两个线程 sysnc_supers(周期性同步所有superblocks) bdi_default(在必要时create start stop flusher线程)
* 系统每新增一个bdi设备 会通过bdi_register()注册到全局bdi_list链表中,每个bdi设备都会有自己的flusher线程,用来回写本设备的脏页
*/
writeback_threshold = sc->nr_to_reclaim sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
} while (--sc->priority >= 0);
delayacct_freepages_end();
/* 如果已经回收到page,则返回成功回收的page num */
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
/* Aborted reclaim to try compaction? don't OOM then */
/* ready则表示已经回收到足够页面,但需要内存压缩来整理碎片才能满足内存分配要求 */
if (sc->compaction_ready)
return 1;
/* Untapped cgroup reserves? Don't OOM retry. */
if (!sc->may_thrash) {
sc->priority = initial_priority;
sc->may_thrash = 1;
goto retry;
}
/* Any of the zones still reclaimable? Don't OOM. */
if (zones_reclaimable)
return 1;
return 0;
}
shrink_zones
/*
* This is the direct reclaim path for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
* We reclaim from a zone even if that zone is over high_wmark_pages(zone).
* Because:
* a) The caller may be trying to free *extra* pages to satisfy a higher-order
* allocation or
* b) The target zone may be at high_wmark_pages(zone) but the lower zones
* must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
* zone defense algorithm.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*
* Returns true if a zone was reclaimable.
*/
static bool shrink_zones(struct zonelist *zonelist struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
gfp_t orig_mask;
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
bool reclaimable = false;
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
orig_mask = sc->gfp_mask;
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
for_each_zone_zonelist_nodemask(zone z zonelist
requested_highidx sc->nodemask) {
enum zone_type classzone_idx;
if (!populated_zone(zone))
continue;
classzone_idx = requested_highidx;
while (!populated_zone(zone->zone_pgdat->node_zones
classzone_idx))
classzone_idx--;
/* 判断是zone级别的还是mem group级别的内存回收 */
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed(zone
GFP_KERNEL | __GFP_HARDWALL))
continue;
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
/*
* If we already have plenty of memory free for
* compaction in this zone don't free any more.
* Even though compaction is invoked for any
* non-zero order only frequent costly order
* reclamation is disruptive enough to become a
* noticeable problem like transparent huge
* page allocations.
*/
/*
* 检测此zone目前是否能通过内存压缩满足内存分配要求:
* 1. 通过内存压缩能满足order阶内存的分配
* 2. 不用内存压缩也能满足order阶内存的分配
*/
if (IS_ENABLED(CONFIG_COMPACTION) &&
sc->order > PAGE_ALLOC_COSTLY_ORDER &&
zonelist_zone_idx(z) <= requested_highidx &&
compaction_ready(zone sc->order)) {
sc->compaction_ready = true;
continue;
}
/*
* 回收该zone上超过soft_limit最多的mem_cgroup在该zone上mem_cgroup_per_zone对应的lru链表
* 直接回收、间接回收都会调用该函数,是调用shrink_zone()前的必备动作
* 通过全局mem group结构体soft_limit_tree->rb_tree_per_node->rb_tree_per_zone->rb_root找到mem_cgroup_per_zone
*/
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone
sc->order sc->gfp_mask
&nr_soft_scanned);
sc->nr_reclaimed = nr_soft_reclaimed;
sc->nr_scanned = nr_soft_scanned;
if (nr_soft_reclaimed)
reclaimable = true;
/* need some check for avoid more shrink_zone() */
}
/* 如果是zone级别的内存回收,则直接调用shrink_zone(); 只要回收到了内存就返回true */
if (shrink_zone(zone sc zone_idx(zone) == classzone_idx))
reclaimable = true;
if (global_reclaim(sc) &&
!reclaimable && zone_reclaimable(zone))
reclaimable = true;
}
/*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
return reclaimable;
}
总结
- 慢速分配中,首先唤醒所有node节点的kswap内核线程
- 然后调用get_page_from_freelist()尝试用min阀值从zonelist的zone中获取连续页框
- 如果失败则对zonelist的zone进行异步压缩,然后调用get_page_from_freelist再次以min阀值尝试分配内存
- 如果还是失败则进入内存回收__alloc_pages_direct_reclaim
- 在进行直接内存回收时,进程是有可能加入到node的pgdat->pfmemalloc_wait这个等待队列中,当kswapd进行内存回收后如果node空闲内存达到平衡,那么就会唤醒pgdat->pfmemalloc_wait中的进程
- 开始标志:进程申请内存时,zonelist的所有zone都不能通过min阀值获取到页框时
- 结束标志:回收到32个页框,或者sc->priority降到0,或者空闲页框足够进行内存压缩了(可能会进行多次shrink_zone()的调用)
- 回收对象:超过所在memcg的soft_limit_in_bytes的进程的内存、zone的可回收页框、slab
PG_lru:表示页在lru链表中
PG_referenced: 表示页最近被访问(只有文件页使用)
PG_dirty:页为脏页,文件页被修改,以及非文件页加入到swap cache后,就会被标记为脏页。在此页回写前会被清除,但是回写失败时又会被置位
PG_active:页为活动页,配合PG_lru就可以得出页是处于非活动页lru链表还是活动页lru链表
PG_private:页描述符中的page->private保存有数据
PG_writeback:页正在进行回写
PG_swapbacked:此页可写入swap分区,一般用于表示此页是非文件页
PG_swapcache:页已经加入到了swap cache中(只有非文件页使用)
PG_reclaim:页正在进行回收,只有在内存回收时才会对需要回收的页进行此标记
PG_mlocked:页被锁在内存中
三个影响内存回收的全局参数/proc/sys/vm/zone_reclaim_mode
只影响快速内存回收
- 0x0:快速内存回收只会对最优zone附近的几个需要进行内存回收的zone进行内存回收
- 0x1:开启zone的内存回收
- 0x2:开启zone的内存回收,并且允许回写
- 0x4:开启zone的内存回收,允许进行unmap操作
影响所有内存回收
- 0:允许直接内存回收对匿名页lru链表中的页进行回写操作,并且允许直接内存回收唤醒flush内核线程
- 非0:直接内存回收不会对匿名页lru链表中的页进行回写操作
影响所有内存回收 在shrink_zones中用于确定扫描匿名页lru链表和文件页lru链表的比例,范围是0~200,系统默认是30;
- 接近0:进行内存回收时,更多地去扫描文件页lru链表,如果为0,那么就不会去扫描匿名页lru链表。
- 接近200:进行内存回收时,更多地去扫描匿名页lru链表。
/* 扫描控制结构,用于内存回收和内存压缩 */
struct scan_control {
/* 需要回收的页框数量 */
unsigned long nr_to_reclaim;
/* 申请内存时使用的分配标志 */
gfp_t gfp_mask;
/* 申请内存时使用的order值,因为只有申请内存,然后内存不足时才会进行扫描 */
int order;
/* 允许执行扫描的node结点掩码 */
nodemask_t *nodemask;
/* 目标memcg,如果是针对整个zone进行的,则此为NULL */
struct mem_cgroup *target_mem_cgroup;
/* 扫描优先级,一次扫描(total_size >> priority)个页框
* 优先级越低,一次扫描的页框数量就越多
* 优先级越高,一次扫描的数量就越少
* 默认优先级为12
*/
int priority;
/* 是否能够进行回写操作(与分配标志的__GFP_IO和__GFP_FS有关) */
unsigned int may_writepage:1;
/* 能否进行unmap操作,就是将所有映射了此页的页表项清空 */
unsigned int may_unmap:1;
/* 是否能够进行swap交换,如果不能,在内存回收时则不扫描匿名页lru链表 */
unsigned int may_swap:1;
unsigned int hibernation_mode:1;
/* 扫描结束后会标记,用于内存回收判断是否需要进行内存压缩 */
unsigned int compaction_ready:1;
/* 已经扫描的页框数量 */
unsigned long nr_scanned;
/* 已经回收的页框数量 */
unsigned long nr_reclaimed;
};