前言

Linux 内核把物理页作为内存管理的基本单位。MMU通常是以页为单位来查找页表,内核用 struct page 结构体来表示系统中的每个物理页。不同的体系结构支持的页大小不同:目前我的x86_64机器上一个物理页的大小为4KB,此外x86_64还支持2MB和1GB的大页。

一、struct page简介

Linux内核将整个物理内存按照页对齐方式划分成千上万个页进行管理。由于一个物理页用一个struct page表示,那么系统会有成千上万个struct page结构体,这些结构体也会占用实际的物理内存,因此,内核选择用union联合体来减少内存的使用。

物理页能有多种用途,比如存放用户进程代码和数据,存放内核代码和数据,用于页表,用于高速缓存,用于内核中动态分配的数据结构,设备驱动程序缓冲的数据,内核模块代码等等。因此内核必须知道物理页是否是空闲状态,属于空闲状态的物理页才能够在一次新的分配内存中分配出去。

union联合体是一种特殊的复合数据类型,其内部定义的所有字段共用同一块内存空间,联合体对象的实际大小由其中最大的字段决定。在 C 语言中,union 用于让同一块内存根据情况保存不同类型的数据,因此一个物理页面可以有多种使用模式。
struct page结构体使用union联合体的原因:
(1)节约内存。
(2)一个物理页可以有多种使用模式。

// linux-4.18/include/linux/mm_types.h

struct page {1unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	/*
	 * Five words (20/40 bytes) are available in this union.
	 * WARNING: bit 0 of the first word is used for PageTail(). That
	 * means the other users of this union MUST NOT use the bit to
	 * avoid collision and false-positive PageTail().
	 */2union {
		2.1
		struct {	/* Page cache and anonymous pages */
			/**
			 * @lru: Pageout list, eg. active_list protected by
			 * zone_lru_lock.  Sometimes used as a generic list
			 * by the page owner.
			 */
			struct list_head lru;
			/* See page-flags.h for PAGE_MAPPING_FLAGS */
			struct address_space *mapping;
			pgoff_t index;		/* Our offset within mapping. */
			/**
			 * @private: Mapping-private opaque data.
			 * Usually used for buffer_heads if PagePrivate.
			 * Used for swp_entry_t if PageSwapCache.
			 * Indicates order in the buddy system if PageBuddy.
			 */
			unsigned long private;
		};
		2.2
		struct {	/* slab, slob and slub */
			union {
				struct list_head slab_list;	/* uses lru */
				struct {	/* Partial pages */
					struct page *next;
#ifdef CONFIG_64BIT
					int pages;	/* Nr of pages left */
					int pobjects;	/* Approximate count */
#else
					short int pages;
					short int pobjects;
#endif
				};
			};
			struct kmem_cache *slab_cache; /* not slob */
			/* Double-word boundary */
			void *freelist;		/* first free object */
			union {
				void *s_mem;	/* slab: first object */
				unsigned long counters;		/* SLUB */
				struct {			/* SLUB */
					unsigned inuse:16;
					unsigned objects:15;
					unsigned frozen:1;
				};
			};
		};
		2.3
		struct {	/* Tail pages of compound page */
			unsigned long compound_head;	/* Bit zero is set */

			/* First tail page only */
			unsigned char compound_dtor;
			unsigned char compound_order;
			atomic_t compound_mapcount;
		};
		2.4
		struct {	/* Second tail page of compound page */
			unsigned long _compound_pad_1;	/* compound_head */
			unsigned long _compound_pad_2;
			struct list_head deferred_list;
		};
		2.5
		struct {	/* Page table pages */
			unsigned long _pt_pad_1;	/* compound_head */
			pgtable_t pmd_huge_pte; /* protected by page->ptl */
			unsigned long _pt_pad_2;	/* mapping */
			struct mm_struct *pt_mm;	/* x86 pgds only */
#if ALLOC_SPLIT_PTLOCKS
			spinlock_t *ptl;
#else
			spinlock_t ptl;
#endif
		};
		2.6
		struct {	/* ZONE_DEVICE pages */
			/** @pgmap: Points to the hosting device page map. */
			struct dev_pagemap *pgmap;
			unsigned long hmm_data;
			unsigned long _zd_pad_1;	/* uses mapping */
		};

		/** @rcu_head: You can use this to free a page by RCU. */
		struct rcu_head rcu_head;
	};3union {		/* This union is 4 bytes in size. */
		/*
		 * If the page can be mapped to userspace, encodes the number
		 * of times this page is referenced by a page table.
		 */
		atomic_t _mapcount;

		/*
		 * If the page is neither PageSlab nor mappable to userspace,
		 * the value stored here may help determine what this page
		 * is used for.  See page-flags.h for a list of page types
		 * which are currently stored here.
		 */
		unsigned int page_type;

		unsigned int active;		/* SLAB */
		int units;			/* SLOB */
	};4/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
	atomic_t _refcount;5#ifdef CONFIG_MEMCG
	struct mem_cgroup *mem_cgroup;
#endif

	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif
} _struct_page_alignment;

上面的内容比较多,简写一下如下所示。可以看到主要由两个union组成:第一个union联合体占用40个字节(32位系统下为20个字节),第二个union联合体占4个字节。

struct page {
	1
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	/*
	 * Five words (20/40 bytes) are available in this union.
	 * WARNING: bit 0 of the first word is used for PageTail(). That
	 * means the other users of this union MUST NOT use the bit to
	 * avoid collision and false-positive PageTail().
	 */
	2
	union {
	 	 2.1
		 struct {	/* Page cache and anonymous pages */
		 }
		 2.2
		 struct {	/* slab, slob and slub */
		 }
		 2.3
		 struct {	/* Tail pages of compound page */
		 }
		 2.4
		 struct {	/* Second tail page of compound page */
		 }
		 2.5
		 struct {	/* Page table pages */
		 }
		 2.6
		 struct {	/* ZONE_DEVICE pages */
		 }
	 }
	
	3
	union {		/* This union is 4 bytes in size. */
	}

	4
	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
	atomic_t _refcount;

	5
	/* 一些编译配置选项 */ 
	
}

如图所示:
在这里插入图片描述

二、flags标志位

该结构体成员与体系架构无关,主要用来存放页的属性,页的各种不同属性通过一系列标志描述,存储在flags成员的各个bit中,比如该页是否被锁定中(如果该bit置位,表示内核的其他部分不允许访问该页,防止内存管理出现竞态条件)、该页是否能够被回收、该页最近是否被访问过、该页数据是否是脏的,该页是否用于slab分配器等等。

// linux-4.18/include/linux/page-flags.h

/*
 * Don't use the *_dontuse flags.  Use the macros.  Otherwise you'll break
 * locked- and dirty-page accounting.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
/*
 * Bit numbers of the individual page flags kept in page->flags.
 * The enumerators after __NR_PAGEFLAGS are aliases: each one reuses the bit
 * of an earlier flag for a specific user (filesystems, swap, Xen, SLOB, ...).
 */
enum pageflags {
	PG_locked,		/* Page is locked. Don't touch. */
	PG_error,		/* Set when an error occurred during I/O on this page */
	PG_referenced,	/* Page was accessed recently; used to track how active the page is */
	PG_uptodate,	/* Page data was read from the block device without error and matches the data on the device */
	PG_dirty,		/* Page data was modified and differs from the block device (disk); the kernel writes it back at a suitable time */
	PG_lru,			/* Page is on an LRU list; the kernel keeps two LRU lists to separate active and inactive pages */
	PG_active,		/* Page is on the active LRU list */
	PG_waiters,		/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
	PG_slab,		/* Page is used by the slab allocator, which splits it into equally sized objects */
	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
	PG_arch_1,
	PG_reserved,	/* When set, the page must not be swapped out */
	PG_private,		/* If pagecache, has fs-private data */
	PG_private_2,		/* If pagecache, has fs aux data */
	PG_writeback,		/* Page is under writeback */
	PG_head,		/* A head page */
	PG_mappedtodisk,	/* Has blocks allocated on-disk */
	PG_reclaim,		/* To be reclaimed asap */
	PG_swapbacked,		/* Page is backed by RAM/swap */
	PG_unevictable,		/* Page is "unevictable"  */
#ifdef CONFIG_MMU
	PG_mlocked,		/* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
	PG_uncached,		/* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
	PG_hwpoison,		/* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
	PG_young,
	PG_idle,
#endif
	__NR_PAGEFLAGS,		/* Number of flag bits defined above for this configuration */

	/* Filesystems */
	PG_checked = PG_owner_priv_1,

	/* SwapBacked */
	PG_swapcache = PG_owner_priv_1,	/* Swap page: swp_entry_t in private */

	/* Two page bits are conscripted by FS-Cache to maintain local caching
	 * state.  These bits are set on pages belonging to the netfs's inodes
	 * when those inodes are being locally cached.
	 */
	PG_fscache = PG_private_2,	/* page backed by cache */

	/* XEN */
	/* Pinned in Xen as a read-only pagetable page. */
	PG_pinned = PG_owner_priv_1,
	/* Pinned as part of domain save (see xen_mm_pin_all()). */
	PG_savepinned = PG_dirty,
	/* Has a grant mapping of another (foreign) domain's page. */
	PG_foreign = PG_owner_priv_1,

	/* SLOB */
	PG_slob_free = PG_private,

	/* Compound pages. Stored in first tail page's flags */
	PG_double_map = PG_private_2,

	/* non-lru isolated movable page */
	PG_isolated = PG_reclaim,
};

还有一系列宏用来操作该flags的各个标志位,比如:
SetPageXXX用来设置XXXbit。
ClearPageXXX用来清除XXXbit。
PageXXX用来查询是否置位。

/*
 * SetPageUptodate - set the PG_uptodate bit in @page->flags.
 *
 * @page is checked with VM_BUG_ON_PAGE() not to be a tail page, and a write
 * barrier is issued before the bit is set (see the comment in the body).
 */
static __always_inline void SetPageUptodate(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	/*
	 * Memory barrier must be issued before setting the PG_uptodate bit,
	 * so that all previous stores issued in order to bring the page
	 * uptodate are actually visible before PageUptodate becomes true.
	 */
	smp_wmb();
	set_bit(PG_uptodate, &page->flags);
}
ClearPageActive(page);
PageSlab(page);

对于64位系统,flags是64位。
FIELD区域是为映射zone、node(NUMA)和SPARSEMEM section而保留的。NR_PAGEFLAGS以上的位不再用于页属性标志位,而是用作其它用途:用来描述zone、node(NUMA)和SPARSEMEM section。

 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *

在这里插入图片描述
主要分为4部分,其中标志位flag向高位增长,其余位字段向低位增长,中间存在空闲位。
section:主要用于内存模型SPARSEMEM。
node:NUMA节点号,标识该page属于哪一个节点。
zone:内存域标志,标识该page属于哪一个zone。

三、第一个union

该union有多个结构体,表示一个物理页面使用模式有多种,有多种使用场景。

	/*
	 * Five words (20/40 bytes) are available in this union.
	 * WARNING: bit 0 of the first word is used for PageTail(). That
	 * means the other users of this union MUST NOT use the bit to
	 * avoid collision and false-positive PageTail().
	 */
	 union {
		 struct {	/* Page cache and anonymous pages */
		 }
		 struct {	/* slab, slob and slub */
		 }
		 struct {	/* Tail pages of compound page */
		 }
		 struct {	/* Second tail page of compound page */
		 }
		 struct {	/* Page table pages */
		 }
		 struct {	/* ZONE_DEVICE pages */
		 }
	 }

3.1 页缓存和匿名页

当该物理页使用该模式时,表示该页用于页缓存和匿名页。这种模式要用就用一整页。这一整页的内存,或者直接和虚拟地址空间建立映射关系,我们把这种称为匿名页(Anonymous Page)。或者用于关联一个文件,然后再和虚拟地址空间建立映射关系,这样的文件,我们称为内存映射文件(Memory-mapped File)。

	struct {	/* Page cache and anonymous pages */
		/**
		 * @lru: Pageout list, eg. active_list protected by
		 * zone_lru_lock.  Sometimes used as a generic list
		 * by the page owner.
		 */
		struct list_head lru;
		/* See page-flags.h for PAGE_MAPPING_FLAGS */
		struct address_space *mapping;
		pgoff_t index;		/* Our offset within mapping. */
		/**
		 * @private: Mapping-private opaque data.
		 * Usually used for buffer_heads if PagePrivate.
		 * Used for swp_entry_t if PageSwapCache.
		 * Indicates order in the buddy system if PageBuddy.
		 */
		unsigned long private;
	};

struct list_head lru 用于把这一页挂到 LRU 链表上(内核用 active 和 inactive 两类 LRU 链表来区分活跃页和不活跃页,例如受 zone_lru_lock 保护的 active_list);页的所有者有时也会把它当作通用链表节点使用。例如页面被分配后,会根据页面的活跃状态挂接到对应的 LRU 链表中。

struct address_space *mapping 指定了物理页所在的地址空间,就是用于内存映射,如果是匿名页,最低位为 1,mapping实际上指向的是struct anon_vma *结构;如果是映射文件,最低位为 0,mapping指向的是struct address_space *结构;该结构体对于实现匿名页的逆向映射非常重要。因此该指针有双重的使用技巧。同时可以用来判断该物理页是否属于未关联到地址空间的某个匿名内存区

pgoff_t index 是物理页在映射区的偏移量;

判断是否是匿名页:

// linux-4.18/include/linux/page-flags.h

/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it. 
 *
 */
/* Returns 1 if the PAGE_MAPPING_ANON bit is set in the mapping pointer, 0 otherwise. */
static __always_inline int PageAnon(struct page *page)
{
	/* The mapping is read from the compound head, not from @page itself. */
	page = compound_head(page);
	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

判断是否用于页缓存:

// linux-4.18/include/linux/mm_inline.h

/**
 * page_is_file_cache - should the page be on a file LRU or anon LRU?
 * @page: the page to test
 *
 * Returns 1 if @page is page cache page backed by a regular filesystem,
 * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
 * Used by functions that manipulate the LRU lists, to sort a page
 * onto the right LRU list.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the page is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 */
static inline int page_is_file_cache(struct page *page)
{
	return !PageSwapBacked(page);
}

3.2 sla(u)b

当物理页使用该模式时,物理页被划分为多块大小相等的小内存,以字节为单位进行分配,而不是像上面一样整页整页地使用。例如分配一个 task_struct 结构时,只需要分配一小块内存来存储这个进程描述符对象。这里默认以slub分配器为例进行说明,采用该结构体表示该页已经由slub分配器按小块内存进行管理。

		struct {	/* slab, slob and slub */
			union {
				struct list_head slab_list;	/* uses lru */
				struct {	/* Partial pages */
					struct page *next;
#ifdef CONFIG_64BIT
					int pages;	/* Nr of pages left */
					int pobjects;	/* Approximate count */
#else
					short int pages;
					short int pobjects;
#endif
				};
			};
			struct kmem_cache *slab_cache; /* not slob */
			/* Double-word boundary */
			void *freelist;		/* first free object */
			union {
				void *s_mem;	/* slab: first object */
				unsigned long counters;		/* SLUB */
				struct {			/* SLUB */
					unsigned inuse:16;
					unsigned objects:15;
					unsigned frozen:1;
				};
			};
		};

只介绍与slub分配器有关的成员:
struct kmem_cache *slab_cache指向其所属的slub管理器:struct kmem_cache。

void *freelist指向所属slab的第一个free object。

unsigned long counters:slub分配器用来计数。

为了节省存储空间,使用位域来表示以下变量:
unsigned inuse:16 表示该页中已经使用的slub对象的个数,用16位标识。
unsigned objects:15 表示该页中slub对象的总数,用15位标识。
unsigned frozen:1 表示该slab是否被冻结,即是否归某个CPU的per-cpu缓存所有(struct kmem_cache_cpu,对应kmem_cache的cpu_slab成员,是用来实现每个CPU上slab缓存的per-cpu变量),用1位标识。

位域是把一个字节中的二进位划分为几个不同的区域,并说明每个区域的位数。每个域有一个域名,允许在程序中按域名进行操作。这样就可以把几个不同的对象用一个字节的二进制位域来表示。位域在本质上就是一种结构类型,不过其成员是按二进位分配的。

3.3 compound page

compound 相关的变量用于复合页(Compound Page),复合页面只是将两个或多个物理上相邻连续的页面组合成一个单元,在许多方面复合页可以被视为一个更大的页面。它们最常用于创建巨大的页面,在hugetlbfs或transparent huge pages子系统中使用,但它们也出现在其他上下文中。复合页面可以用作匿名内存,也可以用作内核内的缓冲区;然而,它们不能出现在page cache中,page cache只准备处理单一页面。

注意,复合页面不同于从正常的高阶分配请求返回的页面。类似于:

 pages = alloc_pages(GFP_KERNEL, 2);  /* no __GFP_COMP */

将返回四个物理上连续的页面,但它们不会是一个复合页面。不同之处在于,创建复合页面需要创建相当数量的元数据;很多时候,元数据是不需要的,因此可以避免创建元数据的开销。

关于复合页请参考:https://lwn.net/Articles/619514/

四、第二个union

	union {		/* This union is 4 bytes in size. */
		/*
		 * If the page can be mapped to userspace, encodes the number
		 * of times this page is referenced by a page table.
		 */
		atomic_t _mapcount;

		/*
		 * If the page is neither PageSlab nor mappable to userspace,
		 * the value stored here may help determine what this page
		 * is used for.  See page-flags.h for a list of page types
		 * which are currently stored here.
		 */
		unsigned int page_type;

		unsigned int active;		/* SLAB */
		int units;			/* SLOB */
	};

atomic_t _mapcount 表示内存管理子系统中映射到该物理页的页表项计数,即统计指向该物理页的页表项数目。如果该物理页可以映射到用户空间,_mapcount 记录(encode)页表引用该页的次数,同时可以用来判断该页是否已经被映射:_mapcount 为 -1 表示没有被任何页表映射;等于 0 表示只有一个页表项映射了该页(通常即只被一个进程使用);大于 0 表示除了该进程外还有其他进程映射了这个页面。

unsigned int page_type表示该物理页的使用类型:

// linux-4.18/include/linux/page-flags.h

/*
 * For pages that are never mapped to userspace (and aren't PageSlab),
 * page_type may be used.  Because it is initialised to -1, we invert the
 * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
 * __ClearPageFoo *sets* the bit used for PageFoo.  We reserve a few high and
 * low bits so that an underflow or overflow of page_mapcount() won't be
 * mistaken for a page type value.
 */

/* High bits that PageType() requires to be all set in page->page_type. */
#define PAGE_TYPE_BASE	0xf0000000
/* Reserve		0x0000007f to catch underflows of page_mapcount */
/* One bit per page type; PageType() treats a type as set when its bit is clear. */
#define PG_buddy	0x00000080
#define PG_balloon	0x00000100
#define PG_kmemcg	0x00000200
#define PG_table	0x00000400

相关物理页类型API:

/*
 * PageType - true when all PAGE_TYPE_BASE bits of page->page_type are set and
 * every bit of @flag is clear.  With @flag == 0 only the PAGE_TYPE_BASE bits
 * are checked.
 */
#define PageType(page, flag)						\
	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)

/*
 * PAGE_TYPE_OPS - generate the three accessors for one page type:
 *   Page<uname>()        tests the type via PageType(page, PG_<lname>)
 *   __SetPage<uname>()   sets the type by clearing the PG_<lname> bit
 *   __ClearPage<uname>() clears the type by setting the PG_<lname> bit
 * Both modifiers are guarded by a VM_BUG_ON_PAGE() sanity check.
 */
#define PAGE_TYPE_OPS(uname, lname)					\
static __always_inline int Page##uname(struct page *page)		\
{									\
	return PageType(page, PG_##lname);				\
}									\
static __always_inline void __SetPage##uname(struct page *page)		\
{									\
	VM_BUG_ON_PAGE(!PageType(page, 0), page);			\
	page->page_type &= ~PG_##lname;					\
}									\
static __always_inline void __ClearPage##uname(struct page *page)	\
{									\
	VM_BUG_ON_PAGE(!Page##uname(page), page);			\
	page->page_type |= PG_##lname;					\
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 * Expands to PageBuddy(), __SetPageBuddy() and __ClearPageBuddy().
 */
PAGE_TYPE_OPS(Buddy, buddy)

/*
 * PageBalloon() is true for pages that are on the balloon page list
 * (see mm/balloon_compaction.c).
 * Expands to PageBalloon(), __SetPageBalloon() and __ClearPageBalloon().
 */
PAGE_TYPE_OPS(Balloon, balloon)

/*
 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
 * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
 * Expands to PageKmemcg(), __SetPageKmemcg() and __ClearPageKmemcg().
 */
PAGE_TYPE_OPS(Kmemcg, kmemcg)

/*
 * Marks pages in use as page tables.
 * Expands to PageTable(), __SetPageTable() and __ClearPageTable().
 */
PAGE_TYPE_OPS(Table, table)

五、_refcount

_refcount表示该物理页的引用次数,即该页被引用了多少次。当计数值等于0时,表示当前内核没有引用该物理页,该页处于空闲状态,可以在新的物理内存分配中使用(较早的2.6内核中对应的 _count 字段曾以-1表示空闲)。
该引用计数成员相关的API:

// linux-4.18/include/linux/page_ref.h

/* Return the current value of @page's own _refcount (no compound-head lookup). */
static inline int page_ref_count(struct page *page)
{
	return atomic_read(&page->_refcount);
}

/* Return the _refcount of the compound head of @page. */
static inline int page_count(struct page *page)
{
	return atomic_read(&compound_head(page)->_refcount);
}

/*
 * Set @page's _refcount to @v.  When the page_ref_set tracepoint is active,
 * the new value is also passed to __page_ref_set().
 */
static inline void set_page_count(struct page *page, int v)
{
	atomic_set(&page->_refcount, v);
	if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
		__page_ref_set(page, v);
}

/*
 * Setup the page count before being freed into the page allocator for
 * the first time (boot or memory hotplug)
 *
 * Implemented as set_page_count(page, 1).
 */
static inline void init_page_count(struct page *page)
{
	set_page_count(page, 1);
}

/*
 * Atomically add @nr to @page's _refcount.  When the page_ref_mod tracepoint
 * is active, the delta is also passed to __page_ref_mod().
 */
static inline void page_ref_add(struct page *page, int nr)
{
	atomic_add(nr, &page->_refcount);
	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
		__page_ref_mod(page, nr);
}

/*
 * Atomically subtract @nr from @page's _refcount.  When the page_ref_mod
 * tracepoint is active, the delta (-@nr) is also passed to __page_ref_mod().
 */
static inline void page_ref_sub(struct page *page, int nr)
{
	atomic_sub(nr, &page->_refcount);
	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
		__page_ref_mod(page, -nr);
}

/*
 * Atomically increment @page's _refcount by one.  When the page_ref_mod
 * tracepoint is active, a delta of 1 is also passed to __page_ref_mod().
 */
static inline void page_ref_inc(struct page *page)
{
	atomic_inc(&page->_refcount);
	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
		__page_ref_mod(page, 1);
}

/*
 * Atomically decrement @page's _refcount by one.  When the page_ref_mod
 * tracepoint is active, a delta of -1 is also passed to __page_ref_mod().
 */
static inline void page_ref_dec(struct page *page)
{
	atomic_dec(&page->_refcount);
	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
		__page_ref_mod(page, -1);
}

......

参考资料

Linux 4.18.0

深入Linux内核架构
https://blog.csdn.net/weixin_42730667/article/details/119190381
https://cloud.tencent.com/developer/article/1374640
https://blog.csdn.net/weixin_42318651/article/details/108248813
https://blog.csdn.net/pwl999/article/details/109539348

Logo

为开发者提供学习成长、分享交流、生态实践、资源工具等服务,帮助开发者快速成长。

更多推荐