|
Message
From: cvs at opencores.org<cvs@o...>
Date: Tue Dec 20 12:47:05 CET 2005
Subject: [cvs-checkins] MODIFIED: or1k ...
Date: 00/05/12 20:12:47 Added: or1k/rc203soc/sw/uClinux/mm Makefile filemap.c kmalloc.c memory.c mlock.c mmap.c mprotect.c mremap.c page_alloc.c page_io.c swap.c swap_state.c swapfile.c vmalloc.c vmscan.c Log: First Import of RC20x uClinux Revision Changes Path 1.1 or1k/rc203soc/sw/uClinux/mm/Makefile http://www.opencores.org/cvsweb.shtml/or1k/rc203soc/sw/uClinux/mm/Makefile?rev=1.1&content-type=text/x-cvsweb-markup Index: Makefile =================================================================== # # Makefile for the linux memory manager. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here # unless it's something special (ie not a .c file). # # Note 2! The CFLAGS definition is now in the main makefile... O_TARGET := mm.o O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ kmalloc.o vmalloc.o \ swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o include $(TOPDIR)/Rules.make 1.1 or1k/rc203soc/sw/uClinux/mm/filemap.c http://www.opencores.org/cvsweb.shtml/or1k/rc203soc/sw/uClinux/mm/filemap.c?rev=1.1&content-type=text/x-cvsweb-markup Index: filemap.c =================================================================== /* * linux/mm/filemap.c * * Copyright (C) 1994, 1995 Linus Torvalds */ /* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem does this differently, for example) */ #include <linux/config.h> /* CONFIG_READA_SMALL */ #include <linux/stat.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/shm.h> #include <linux/errno.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/malloc.h> #include <linux/fs.h> #include <linux/locks.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <asm/segment.h> #include <asm/system.h> #include <asm/pgtable.h> /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. */ unsigned long page_cache_size = 0; struct page * page_hash_table[PAGE_HASH_SIZE]; /* * Simple routines for both non-shared and shared mappings. */ #define release_page(page) __free_page((page)) /* * Invalidate the pages of an inode, removing all pages that aren't * locked down (those are sure to be up-to-date anyway, so we shouldn't * invalidate them). */ void invalidate_inode_pages(struct inode * inode) { struct page ** p; struct page * page; p = &inode->i_pages; while ((page = *p) != NULL) { if (PageLocked(page)) { p = &page->next; continue; }
inode->i_nrpages--;
if ((*p = page->next) != NULL)
(*p)->prev = page->prev;
page->dirty = 0;
page->next = NULL;
page->prev = NULL;
remove_page_from_hash_queue(page);
page->inode = NULL;
__free_page(page);
continue;
}
}
/*
* Truncate the page cache at a set offset, removing the pages
* that are beyond that offset (and zeroing out partial pages).
*/
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
struct page ** p;
struct page * page;
repeat:
p = &inode->i_pages;
while ((page = *p) != NULL) {
unsigned long offset = page->offset;
/* page wholly truncated - free it */
if (offset >= start) {
if (PageLocked(page)) {
__wait_on_page(page);
goto repeat;
}
inode->i_nrpages--;
if ((*p = page->next) != NULL)
(*p)->prev = page->prev;
page->dirty = 0;
page->next = NULL;
page->prev = NULL;
remove_page_from_hash_queue(page);
page->inode = NULL;
__free_page(page);
continue;
}
p = &page->next;
offset = start - offset;
/* partial truncate, clear end of page */
if (offset < PAGE_SIZE) {
unsigned long address = page_address(page);
memset((void *) (offset + address), 0, PAGE_SIZE - offset);
flush_page_to_ram(address);
}
}
}
int shrink_mmap(int priority, int dma, int free_buf)
{
static int clock = 0;
struct page * page;
unsigned long limit = MAP_NR(high_memory);
struct buffer_head *tmp, *bh;
int count_max, count_min;
count_max = (limit<<1) >> (priority>>1);
count_min = (limit<<1) >> (priority);
page = mem_map + clock;
do {
count_max--;
if (page->inode || page->buffers)
count_min--;
if (PageLocked(page))
goto next;
if (dma && !PageDMA(page))
goto next;
/* First of all, regenerate the page's referenced bit
from any buffers in the page */
bh = page->buffers;
if (bh) {
tmp = bh;
do {
if (buffer_touched(tmp)) {
clear_bit(BH_Touched, &tmp->b_state);
set_bit(PG_referenced, &page->flags);
}
tmp = tmp->b_this_page;
} while (tmp != bh);
}
/* We can't throw away shared pages, but we do mark
them as referenced. This relies on the fact that
no page is currently in both the page cache and the
buffer cache; we'd have to modify the following
test to allow for that case. */
switch (page->count) {
case 1:
/* If it has been referenced recently, don't free it */
if (clear_bit(PG_referenced, &page->flags)) {
/* age this page potential used */
if (priority < 4)
age_page(page);
break;
}
/* is it a page cache page? */
if (page->inode) {
remove_page_from_hash_queue(page);
remove_page_from_inode_queue(page);
__free_page(page);
return 1;
}
/* is it a buffer cache page? */
if (free_buf && bh && try_to_free_buffer(bh, &bh, 6))
return 1;
break;
default:
/* more than one users: we can't throw it away */
set_bit(PG_referenced, &page->flags);
/* fall through */
case 0:
/* nothing */
}
next:
page++;
clock++;
if (clock >= limit) {
clock = 0;
page = mem_map;
}
} while (count_max > 0 && count_min > 0);
return 0;
}
/*
* This is called from try_to_swap_out() when we try to get rid of some
* pages.. If we're unmapping the last occurrence of this page, we also
* free it from the page hash-queues etc, as we don't want to keep it
* in-core unnecessarily.
*/
unsigned long page_unuse(unsigned long page)
{
struct page * p = mem_map + MAP_NR(page);
int count = p->count;
if (count != 2)
return count;
if (!p->inode)
return count;
remove_page_from_hash_queue(p);
remove_page_from_inode_queue(p);
free_page(page);
return 1;
}
/*
* Update a page cache copy, when we're doing a "write()" system call
* See also "update_vm_cache()".
*/
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
unsigned long offset, len;
offset = (pos & ~PAGE_MASK);
pos = pos & PAGE_MASK;
len = PAGE_SIZE - offset;
do {
struct page * page;
if (len > count)
len = count;
page = find_page(inode, pos);
if (page) {
wait_on_page(page);
memcpy((void *) (offset + page_address(page)), buf, len);
release_page(page);
}
count -= len;
buf += len;
len = PAGE_SIZE;
offset = 0;
pos += PAGE_SIZE;
} while (count);
}
static inline void add_to_page_cache(struct page * page,
struct inode * inode, unsigned long offset,
struct page **hash)
{
page->count++;
page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
}
/*
* Try to read ahead in the file. "page_cache" is a potentially free page
* that we could use for the cache (if it is 0 we can try to create one,
* this is all overlapped with the IO on the previous page finishing anyway)
*/
static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
{
struct page * page;
struct page ** hash;
offset &= PAGE_MASK;
switch (page_cache) {
case 0:
page_cache = __get_free_page(GFP_KERNEL);
if (!page_cache)
break;
default:
if (offset >= inode->i_size)
break;
hash = page_hash(inode, offset);
page = __find_page(inode, offset, *hash);
if (!page) {
/*
* Ok, add the new page to the hash-queues...
*/
page = mem_map + MAP_NR(page_cache);
add_to_page_cache(page, inode, offset, hash);
inode->i_op->readpage(inode, page);
page_cache = 0;
}
release_page(page);
}
return page_cache;
}
/*
* Wait for IO to complete on a locked page.
*
* This must be called with the caller "holding" the page,
* ie with increased "page->count" so that the page won't
* go away during the wait..
*/
void __wait_on_page(struct page *page)
{
struct wait_queue wait = { current, NULL };
add_wait_queue(&page->wait, &wait);
repeat:
run_task_queue(&tq_disk);
current->state = TASK_UNINTERRUPTIBLE;
if (PageLocked(page)) {
schedule();
goto repeat;
}
remove_wait_queue(&page->wait, &wait);
current->state = TASK_RUNNING;
}
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif
/*
* Read-ahead profiling information
* --------------------------------
* Every PROFILE_MAXREADCOUNT, the following information is written
* to the syslog:
* Percentage of asynchronous read-ahead.
* Average of read-ahead fields context value.
* If DEBUG_READAHEAD is defined, a snapshot of these fields is written
* to the syslog.
*/
#ifdef PROFILE_READAHEAD
#define PROFILE_MAXREADCOUNT 1000
static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;
static void profile_readahead(int async, struct file *filp)
{
unsigned long flags;
++total_reada;
if (async)
++total_async;
total_ramax += filp->f_ramax;
total_ralen += filp->f_ralen;
total_rawin += filp->f_rawin;
if (total_reada > PROFILE_MAXREADCOUNT) {
save_flags(flags);
cli();
if (!(total_reada > PROFILE_MAXREADCOUNT)) {
restore_flags(flags);
return;
}
printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
total_ramax/total_reada,
total_ralen/total_reada,
total_rawin/total_reada,
(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif
total_reada = 0;
total_async = 0;
total_ramax = 0;
total_ralen = 0;
total_rawin = 0;
restore_flags(flags);
}
}
#endif /* defined PROFILE_READAHEAD */
/*
* Read-ahead context:
* -------------------
* The read ahead context fields of the "struct file" are the following:
* - f_raend : position of the first byte after the last page we tried to
* read ahead.
* - f_ramax : current read-ahead maximum size.
* - f_ralen : length of the current IO read block we tried to read-ahead.
* - f_rawin : length of the current read-ahead window.
* if last read-ahead was synchronous then
* f_rawin = f_ralen
* otherwise (was asynchronous)
* f_rawin = previous value of f_ralen + f_ralen
*
* Read-ahead limits:
* ------------------
* MIN_READAHEAD : minimum read-ahead size when read-ahead.
* MAX_READAHEAD : maximum read-ahead size when read-ahead.
*
* Synchronous read-ahead benefits:
* --------------------------------
* Using reasonable IO xfer length from peripheral devices increase system
* performances.
* Reasonable means, in this context, not too large but not too small.
* The actual maximum value is:
* MAX_READAHEAD + PAGE_SIZE = 76k is CONFIG_READA_SMALL is undefined
* and 32K if defined (4K page size assumed).
*
* Asynchronous read-ahead benefits:
* ---------------------------------
* Overlapping next read request and user process execution increase system
* performance.
*
* Read-ahead risks:
* -----------------
* We have to guess which further data are needed by the user process.
* If these data are often not really needed, it's bad for system
* performances.
* However, we know that files are often accessed sequentially by
* application programs and it seems that it is possible to have some good
* strategy in that guessing.
* We only try to read-ahead files that seems to be read sequentially.
*
* Asynchronous read-ahead risks:
* ------------------------------
* In order to maximize overlapping, we must start some asynchronous read
* request from the device, as soon as possible.
* We must be very careful about:
* - The number of effective pending IO read requests.
* ONE seems to be the only reasonable value.
* - The total memory pool usage for the file access stream.
* This maximum memory usage is implicitly 2 IO read chunks:
* 2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
* 64k if defined (4K page size assumed).
*/
#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)
#ifdef CONFIG_READA_SMALL /* small readahead */
#define MAX_READAHEAD PageAlignSize(4096*7)
#define MIN_READAHEAD PageAlignSize(4096*2)
#else /* large readahead */
#define MAX_READAHEAD PageAlignSize(4096*18)
#define MIN_READAHEAD PageAlignSize(4096*3)
#endif
static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode,
unsigned long ppos, struct page * page,
unsigned long page_cache)
{
unsigned long max_ahead, ahead;
unsigned long raend;
raend = filp->f_raend & PAGE_MASK;
max_ahead = 0;
/*
* The current page is locked.
* If the current position is inside the previous read IO request, do not
* try to reread previously read ahead pages.
* Otherwise decide or not to read ahead some pages synchronously.
* If we are not going to read ahead, set the read ahead context for this
* page only.
*/
if (PageLocked(page)) {
if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
raend = ppos;
if (raend < inode->i_size)
max_ahead = filp->f_ramax;
filp->f_rawin = 0;
filp->f_ralen = PAGE_SIZE;
if (!max_ahead) {
filp->f_raend = ppos + filp->f_ralen;
filp->f_rawin += filp->f_ralen;
}
}
}
/*
* The current page is not locked.
* If we were reading ahead and,
* if the current max read ahead size is not zero and,
* if the current position is inside the last read-ahead IO request,
* it is the moment to try to read ahead asynchronously.
* We will later force unplug device in order to force asynchronous read IO.
*/
else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
* Add ONE page to max_ahead in order to try to have about the same IO max size
* as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
* Compute the position of the last page we have tried to read in order to
* begin to read ahead just at the next page.
*/
raend -= PAGE_SIZE;
if (raend < inode->i_size)
max_ahead = filp->f_ramax + PAGE_SIZE;
if (max_ahead) {
filp->f_rawin = filp->f_ralen;
filp->f_ralen = 0;
reada_ok = 2;
}
}
/*
* Try to read ahead pages.
* We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
* scheduler, will work enough for us to avoid too bad actuals IO requests.
*/
ahead = 0;
while (ahead < max_ahead) {
ahead += PAGE_SIZE;
page_cache = try_to_read_ahead(inode, raend + ahead, page_cache);
}
/*
* If we tried to read ahead some pages,
* If we tried to read ahead asynchronously,
* Try to force unplug of the device in order to start an asynchronous
* read IO request.
* Update the read-ahead context.
* Store the length of the current read-ahead window.
* Double the current max read ahead size.
* That heuristic avoid to do some large IO for files that are not really
* accessed sequentially.
*/
if (ahead) {
if (reada_ok == 2) {
run_task_queue(&tq_disk);
}
filp->f_ralen += ahead;
filp->f_rawin += filp->f_ralen;
filp->f_raend = raend + ahead + PAGE_SIZE;
filp->f_ramax += filp->f_ramax;
if (filp->f_ramax > MAX_READAHEAD)
filp->f_ramax = MAX_READAHEAD;
#ifdef PROFILE_READAHEAD
profile_readahead((reada_ok == 2), filp);
#endif
}
return page_cache;
}
/*
* This is a generic file read routine, and uses the
* inode->i_op->readpage() function for the actual low-level
* stuff.
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
{
int error, read;
unsigned long pos, ppos, page_cache;
int reada_ok;
error = 0;
read = 0;
page_cache = 0;
pos = filp->f_pos;
ppos = pos & PAGE_MASK;
/*
* If the current position is outside the previous read-ahead window,
* we reset the current read-ahead context and set read ahead max to zero
* (will be set to just needed value later),
* otherwise, we assume that the file accesses are sequential enough to
* continue read-ahead.
*/
if (ppos > filp->f_raend || ppos + filp->f_rawin < filp->f_raend) {
reada_ok = 0;
filp->f_raend = 0;
filp->f_ralen = 0;
filp->f_ramax = 0;
filp->f_rawin = 0;
} else {
reada_ok = 1;
}
/*
* Adjust the current value of read-ahead max.
* If the read operation stay in the first half page, force no readahead.
* Otherwise try to increase read ahead max just enough to do the read request.
* Then, at least MIN_READAHEAD if read ahead is ok,
* and at most MAX_READAHEAD in all cases.
*/
if (pos + count <= (PAGE_SIZE >> 1)) {
filp->f_ramax = 0;
} else {
unsigned long needed;
needed = ((pos + count) & PAGE_MASK) - ppos;
if (filp->f_ramax < needed)
filp->f_ramax = needed;
if (reada_ok && filp->f_ramax < MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;
if (filp->f_ramax > MAX_READAHEAD)
filp->f_ramax = MAX_READAHEAD;
}
for (;;) {
struct page *page, **hash;
if (pos >= inode->i_size)
break;
/*
* Try to find the data in the page cache..
*/
hash = page_hash(inode, pos & PAGE_MASK);
page = __find_page(inode, pos & PAGE_MASK, *hash);
if (!page)
goto no_cached_page;
found_page:
/*
* Try to read ahead only if the current page is filled or being filled.
* Otherwise, if we were reading ahead, decrease max read ahead size to
* the minimum value.
* In this context, that seems to may happen only on some read error or if
* the page has been rewritten.
*/
if (PageUptodate(page) || PageLocked(page))
page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;
wait_on_page(page);
if (!PageUptodate(page))
goto page_read_error;
success:
/*
* Ok, we have the page, it's up-to-date and ok,
* so now we can finally copy it to user space...
*/
{
unsigned long offset, nr;
offset = pos & ~PAGE_MASK;
nr = PAGE_SIZE - offset;
if (nr > count)
nr = count;
if (nr > inode->i_size - pos)
nr = inode->i_size - pos;
memcpy_tofs(buf, (void *) (page_address(page) + offset), nr);
release_page(page);
buf += nr;
pos += nr;
read += nr;
count -= nr;
if (count) {
/*
* to prevent hogging the CPU on well-cached systems,
* schedule if needed, it's safe to do it here:
*/
if (need_resched)
schedule();
continue;
}
break;
}
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
if (!page_cache) {
page_cache = __get_free_page(GFP_KERNEL);
/*
* That could have slept, so go around to the
* very beginning..
*/
if (page_cache)
continue;
error = -ENOMEM;
break;
}
/*
* Ok, add the new page to the hash-queues...
*/
page = mem_map + MAP_NR(page_cache);
page_cache = 0;
add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
/*
* Error handling is tricky. If we get a read error,
* the cached page stays in the cache (but uptodate=0),
* and the next process that accesses it will try to
* re-read it. This is needed for NFS etc, where the
* identity of the reader can decide if we can read the
* page or not..
*/
/*
* We have to read the page.
* If we were reading ahead, we had previously tried to read this page,
* That means that the page has probably been removed from the cache before
* the application process needs it, or has been rewritten.
* Decrease max readahead size to the minimum value in that situation.
*/
if (reada_ok && filp->f_ramax > MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;
error = inode->i_op->readpage(inode, page);
if (!error)
goto found_page;
release_page(page);
break;
page_read_error:
/*
* We found the page, but it wasn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because this happens only if there were errors.
*/
error = inode->i_op->readpage(inode, page);
if (!error) {
wait_on_page(page);
if (PageUptodate(page) && !PageError(page))
goto success;
error = -EIO; /* Some unspecified error occurred.. */
}
release_page(page);
break;
}
filp->f_pos = pos;
filp->f_reada = 1;
if (page_cache)
free_page(page_cache);
UPDATE_ATIME(inode)
if (!read)
read = error;
return read;
}
/*
* Semantics for shared and private memory areas are different past the end
* of the file. A shared mapping past the last page of the file is an error
* and results in a SIGBUS, while a private mapping just maps in a zero page.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*/
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
unsigned long offset;
struct page * page, **hash;
struct inode * inode = area->vm_inode;
unsigned long old_page, new_page;
new_page = 0;
offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
goto no_page;
/*
* Do we have something in the page cache already?
*/
hash = page_hash(inode, offset);
page = __find_page(inode, offset, *hash);
if (!page)
goto no_cached_page;
found_page:
/*
* Ok, found a page in the page cache, now we need to check
* that it's up-to-date. First check whether we'll need an
* extra page -- better to overlap the allocation with the I/O.
*/
if (no_share && !new_page) {
new_page = __get_free_page(GFP_KERNEL);
if (!new_page)
goto failure;
}
if (PageLocked(page))
goto page_locked_wait;
if (!PageUptodate(page))
goto page_read_error;
success:
/*
* Found the page, need to check sharing and possibly
* copy it over to another page..
*/
old_page = page_address(page);
if (!no_share) {
/*
* Ok, we can share the cached page directly.. Get rid
* of any potential extra pages.
*/
if (new_page)
free_page(new_page);
flush_page_to_ram(old_page);
return old_page;
}
/*
* No sharing ... copy to the new page.
*/
memcpy((void *) new_page, (void *) old_page, PAGE_SIZE);
flush_page_to_ram(new_page);
release_page(page);
return new_page;
no_cached_page:
new_page = __get_free_page(GFP_KERNEL);
if (!new_page)
goto no_page;
/*
* During getting the above page we might have slept,
* so we need to re-check the situation with the page
* cache.. The page we just got may be useful if we
* can't share, so don't get rid of it here.
*/
page = find_page(inode, offset);
if (page)
goto found_page;
/*
* Now, create a new page-cache page from the page we got
*/
page = mem_map + MAP_NR(new_page);
new_page = 0;
add_to_page_cache(page, inode, offset, hash);
if (inode->i_op->readpage(inode, page) != 0)
goto failure;
/*
* Do a very limited read-ahead if appropriate
*/
if (PageLocked(page))
new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
goto found_page;
page_locked_wait:
__wait_on_page(page);
if (PageUptodate(page))
goto success;
page_read_error:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
if (inode->i_op->readpage(inode, page) != 0)
goto failure;
wait_on_page(page);
if (PageError(page))
goto failure;
if (PageUptodate(page))
goto success;
/*
* Uhhuh.. Things didn't work out. Return zero to tell the
* mm layer so, possibly freeing the page cache page first.
*/
failure:
release_page(page);
if (new_page)
free_page(new_page);
no_page:
return 0;
}
/*
* Tries to write a shared mapped page to its backing store. May return -EIO
* if the disk is full.
*/
static inline int do_write_page(struct inode * inode, struct file * file,
const char * page, unsigned long offset)
{
int old_fs, retval;
unsigned long size;
size = offset + PAGE_SIZE;
/* refuse to extend file size.. */
if (S_ISREG(inode->i_mode)) {
if (size > inode->i_size)
size = inode->i_size;
/* Ho humm.. We should have tested for this earlier */
if (size < offset)
return -EIO;
}
size -= offset;
old_fs = get_fs();
set_fs(KERNEL_DS);
retval = -EIO;
if (size == file->f_op->write(inode, file, (const char *) page, size))
retval = 0;
set_fs(old_fs);
return retval;
}
static int filemap_write_page(struct vm_area_struct * vma,
unsigned long offset,
unsigned long page)
{
int result;
struct file file;
struct inode * inode;
struct buffer_head * bh;
bh = mem_map[MAP_NR(page)].buffers;
if (bh) {
/* whee.. just mark the buffer heads dirty */
struct buffer_head * tmp = bh;
do {
mark_buffer_dirty(tmp, 0);
tmp = tmp->b_this_page;
} while (tmp != bh);
return 0;
}
inode = vma->vm_inode;
file.f_op = inode->i_op->default_file_ops;
if (!file.f_op->write)
return -EIO;
file.f_mode = 3;
file.f_flags = 0;
file.f_count = 1;
file.f_inode = inode;
file.f_pos = offset;
file.f_reada = 0;
down(&inode->i_sem);
result = do_write_page(inode, &file, (const char *) page, offset);
up(&inode->i_sem);
return result;
}
/*
* Swapping to a shared file: while we're busy writing out the page
* (and the page still exists in memory), we save the page information
* in the page table, so that "filemap_swapin()" can re-use the page
* immediately if it is called while we're busy swapping it out..
*
* Once we've written it all out, we mark the page entry "empty", which
* will result in a normal page-in (instead of a swap-in) from the now
* up-to-date disk file.
*/
int filemap_swapout(struct vm_area_struct * vma,
unsigned long offset,
pte_t *page_table)
{
int error;
unsigned long page = pte_page(*page_table);
unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
set_pte(page_table, __pte(entry));
flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
error = filemap_write_page(vma, offset, page);
if (pte_val(*page_table) == entry)
pte_clear(page_table);
return error;
}
/*
* filemap_swapin() is called only if we have something in the page
* tables that is non-zero (but not present), which we know to be the
* page index of a page that is busy being swapped out (see above).
* So we just use it directly..
*/
static pte_t filemap_swapin(struct vm_area_struct * vma,
unsigned long offset,
unsigned long entry)
{
unsigned long page = SWP_OFFSET(entry);
mem_map[page].count++;
page = (page << PAGE_SHIFT) + PAGE_OFFSET;
return mk_pte(page,vma->vm_page_prot);
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t pte = *ptep;
unsigned long page;
int error;
if (pte_none(pte))
return 0;
if (!(flags & MS_INVALIDATE)) {
if (!pte_present(pte))
return 0;
if (!pte_dirty(pte))
return 0;
flush_page_to_ram(pte_page(pte));
flush_cache_page(vma, address);
set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
page = pte_page(pte);
mem_map[MAP_NR(page)].count++;
} else {
flush_cache_page(vma, address);
pte_clear(ptep);
flush_tlb_page(vma, address);
if (!pte_present(pte)) {
swap_free(pte_val(pte));
return 0;
}
page = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
free_page(page);
return 0;
}
}
error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
free_page(page);
return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
unsigned long address, unsigned long size,
struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
pte_t * pte;
unsigned long end;
int error;
if (pmd_none(*pmd))
return 0;
if (pmd_bad(*pmd)) {
printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
pmd_clear(pmd);
return 0;
}
pte = pte_offset(pmd, address);
offset += address & PMD_MASK;
address &= ~PMD_MASK;
end = address + size;
if (end > PMD_SIZE)
end = PMD_SIZE;
error = 0;
do {
error |= filemap_sync_pte(pte, vma, address + offset, flags);
address += PAGE_SIZE;
pte++;
} while (address < end);
return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
unsigned long address, unsigned long size,
struct vm_area_struct *vma, unsigned int flags)
{
pmd_t * pmd;
unsigned long offset, end;
int error;
if (pgd_none(*pgd))
return 0;
if (pgd_bad(*pgd)) {
printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
pgd_clear(pgd);
return 0;
}
pmd = pmd_offset(pgd, address);
offset = address & PGDIR_MASK;
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
error = 0;
do {
error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
return error;
}
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
size_t size, unsigned int flags)
{
pgd_t * dir;
unsigned long end = address + size;
int error = 0;
dir = pgd_offset(vma->vm_mm, address);
flush_cache_range(vma->vm_mm, end - size, end);
while (address < end) {
error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
flush_tlb_range(vma->vm_mm, end - size, end);
return error;
}
/*
* This handles (potentially partial) area unmaps..
*/
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
filemap_sync(vma, start, len, MS_ASYNC);
}
/*
* Shared mappings need to be able to do the right thing at
* close/unmap/sync. They will also use the private file as
* backing-store for swapping..
*/
static struct vm_operations_struct file_shared_mmap = {
NULL, /* no special open */
NULL, /* no special close */
filemap_unmap, /* unmap - we need to sync the pages */
NULL, /* no special protect */
filemap_sync, /* sync */
NULL, /* advise */
filemap_nopage, /* nopage */
NULL, /* wppage */
filemap_swapout, /* swapout */
filemap_swapin, /* swapin */
};
/*
* Private mappings just need to be able to load in the map.
*
* (This is actually used for shared mappings as well, if we
* know they can't ever get write permissions..)
*/
static struct vm_operations_struct file_private_mmap = {
NULL, /* open */
NULL, /* close */
NULL, /* unmap */
NULL, /* protect */
NULL, /* sync */
NULL, /* advise */
filemap_nopage, /* nopage */
NULL, /* wppage */
NULL, /* swapout */
NULL, /* swapin */
};
/* This is used for a general mmap of a disk file */
int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
{
struct vm_operations_struct * ops;
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
ops = &file_shared_mmap;
/* share_page() can only guarantee proper page sharing if
* the offsets are all page aligned. */
if (vma->vm_offset & (PAGE_SIZE - 1))
return -EINVAL;
} else {
ops = &file_private_mmap;
if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
return -EINVAL;
}
if (!inode->i_sb || !S_ISREG(inode->i_mode))
return -EACCES;
if (!inode->i_op || !inode->i_op->readpage)
return -ENOEXEC;
UPDATE_ATIME(inode)
vma->vm_inode = inode;
inode->i_count++;
vma->vm_ops = ops;
return 0;
}
/*
* The msync() system call.
*/
static int msync_interval(struct vm_area_struct * vma,
unsigned long start, unsigned long end, int flags)
{
if (vma->vm_inode && vma->vm_ops && vma->vm_ops->sync) {
int error;
error = vma->vm_ops->sync(vma, start, end-start, flags);
if (error)
return error;
if (flags & MS_SYNC)
return file_fsync(vma->vm_inode, NULL);
return 0;
}
return 0;
}
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct vm_area_struct * vma;
int unmapped_error, error;
if (start & ~PAGE_MASK)
return -EINVAL;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
if (end < start)
return -EINVAL;
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
return -EINVAL;
if (end == start)
return 0;
/*
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -EFAULT at the end.
*/
vma = find_vma(current->mm, start);
unmapped_error = 0;
for (;;) {
/* Still start < end. */
if (!vma)
return -EFAULT;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
unmapped_error = -EFAULT;
start = vma->vm_start;
}
/* Here vma->vm_start <= start < vma->vm_end. */
if (end <= vma->vm_end) {
if (start < end) {
error = msync_interval(vma, start, end, flags);
if (error)
return error;
}
return unmapped_error;
}
/* Here vma->vm_start <= start < vma->vm_end < end. */
error = msync_interval(vma, start, vma->vm_end, flags);
if (error)
return error;
start = vma->vm_end;
vma = vma->vm_next;
}
}
1.1 or1k/rc203soc/sw/uClinux/mm/kmalloc.c
http://www.opencores.org/cvsweb.shtml/or1k/rc203soc/sw/uClinux/mm/kmalloc.c?rev=1.1&content-type=text/x-cvsweb-markup
Index: kmalloc.c
===================================================================
/*
* linux/mm/kmalloc.c
*
* Copyright (C) 1991, 1992 Linus Torvalds & Roger Wolff.
*
* Written by R.E. Wolff Sept/Oct '93.
*
*/
/*
* Modified by Alex Bligh (alex@c...) 4 Apr 1994 to use multiple
* pages. So for 'page' throughout, read 'area'.
*
* Largely rewritten.. Linus
*/
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <asm/system.h>
#include <asm/dma.h>
/* Define this if you want slow routines that try to trip errors */
#undef SADISTIC_KMALLOC
/* Private flags. */
#define MF_USED 0xffaa0055
#define MF_DMA 0xff00aa55
#define MF_FREE 0x0055ffaa
/*
* Much care has gone into making these routines in this file reentrant.
*
* The fancy bookkeeping of nbytesmalloced and the like are only used to
* report them to the user (oooohhhhh, aaaaahhhhh....) are not
* protected by cli(). (If that goes wrong. So what?)
*
* These routines restore the interrupt status to allow calling with ints
* off.
*/
/*
* A block header. This is in front of every malloc-block, whether free or not.
*/
struct block_header {
unsigned long bh_flags;
union {
unsigned long ubh_length;
struct block_header *fbh_next;
} vp;
};
#define bh_length vp.ubh_length
#define bh_next vp.fbh_next
#define BH(p) ((struct block_header *)(p))
/*
* The page descriptor is at the front of every page that malloc has in use.
*/
struct page_descriptor {
struct page_descriptor *next;
struct block_header *firstfree;
int order;
int nfree;
};
#define PAGE_DESC(p) ((struct page_descriptor *)(((unsigned long)(p)) & PAGE_MASK))
/*
* A size descriptor describes a specific class of malloc sizes.
* Each class of sizes has its own freelist.
*/
struct size_descriptor {
struct page_descriptor *firstfree;
struct page_descriptor *dmafree; /* DMA-able memory */
int nblocks;
int nmallocs;
int nfrees;
int nbytesmalloced;
int npages;
unsigned long gfporder; /* number of pages in the area required */
};
/*
* For now it is unsafe to allocate bucket sizes between n and
* n-sizeof(page_descriptor) where n is PAGE_SIZE * any power of two
*
* The blocksize and sizes arrays _must_ match!
*/
#if PAGE_SIZE == 4096
static const unsigned int blocksize[] = {
32,
64,
128,
252,
508,
1020,
2040,
4096 - 16,
8192 - 16,
16384 - 16,
32768 - 16,
65536 - 16,
131072 - 16,
0
};
static struct size_descriptor sizes[] =
{
{NULL, NULL, 127, 0, 0, 0, 0, 0},
{NULL, NULL, 63, 0, 0, 0, 0, 0},
{NULL, NULL, 31, 0, 0, 0, 0, 0},
{NULL, NULL, 16, 0, 0, 0, 0, 0},
{NULL, NULL, 8, 0, 0, 0, 0, 0},
{NULL, NULL, 4, 0, 0, 0, 0, 0},
{NULL, NULL, 2, 0, 0, 0, 0, 0},
{NULL, NULL, 1, 0, 0, 0, 0, 0},
{NULL, NULL, 1, 0, 0, 0, 0, 1},
{NULL, NULL, 1, 0, 0, 0, 0, 2},
{NULL, NULL, 1, 0, 0, 0, 0, 3},
{NULL, NULL, 1, 0, 0, 0, 0, 4},
{NULL, NULL, 1, 0, 0, 0, 0, 5},
{NULL, NULL, 0, 0, 0, 0, 0, 0}
};
#elif PAGE_SIZE == 8192
static const unsigned int blocksize[] = {
64,
128,
248,
504,
1016,
2040,
4080,
8192 - 32,
16384 - 32,
32768 - 32,
65536 - 32,
131072 - 32,
262144 - 32,
0
};
struct size_descriptor sizes[] =
{
{NULL, NULL, 127, 0, 0, 0, 0, 0},
{NULL, NULL, 63, 0, 0, 0, 0, 0},
{NULL, NULL, 31, 0, 0, 0, 0, 0},
{NULL, NULL, 16, 0, 0, 0, 0, 0},
{NULL, NULL, 8, 0, 0, 0, 0, 0},
{NULL, NULL, 4, 0, 0, 0, 0, 0},
{NULL, NULL, 2, 0, 0, 0, 0, 0},
{NULL, NULL, 1, 0, 0, 0, 0, 0},
{NULL, NULL, 1, 0, 0, 0, 0, 1},
{NULL, NULL, 1, 0, 0, 0, 0, 2},
{NULL, NULL, 1, 0, 0, 0, 0, 3},
{NULL, NULL, 1, 0, 0, 0, 0, 4},
{NULL, NULL, 1, 0, 0, 0, 0, 5},
{NULL, NULL, 0, 0, 0, 0, 0, 0}
};
#else
#error you need to make a version for your pagesize
#endif
#define NBLOCKS(order) (sizes[order].nblocks)
#define BLOCKSIZE(order) (blocksize[order])
#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder))
/*
* Create a small cache of page allocations: this helps a bit with
* those pesky 8kB+ allocations for NFS when we're temporarily
* out of memory..
*
* This is a _truly_ small cache, we just cache one single page
* order (for orders 0, 1 and 2, that is 4, 8 and 16kB on x86).
*/
#define MAX_CACHE_ORDER 3
struct page_descriptor * kmalloc_cache[MAX_CACHE_ORDER];
static inline struct page_descriptor * get_kmalloc_pages(unsigned long priority,
unsigned long order, int dma)
{
return (struct page_descriptor *) __get_free_pages(priority, order, dma);
}
static inline void free_kmalloc_pages(struct page_descriptor * page,
unsigned long order, int dma)
{
if (!dma && order < MAX_CACHE_ORDER) {
page = xchg(kmalloc_cache+order, page);
if (!page)
return;
}
free_pages((unsigned long) page, order);
}
long kmalloc_init(long start_mem, long end_mem)
{
int order;
/*
* Check the static info array. Things will blow up terribly if it's
* incorrect. This is a late "compile time" check.....
*/
for (order = 0; BLOCKSIZE(order); order++) {
if ((NBLOCKS(order) * BLOCKSIZE(order) + sizeof(struct page_descriptor)) >
AREASIZE(order)) {
printk("Cannot use %d bytes out of %d in order = %d block mallocs\n",
(int) (NBLOCKS(order) * BLOCKSIZE(order) +
sizeof(struct page_descriptor)),
(int) AREASIZE(order),
BLOCKSIZE(order));
panic("This only happens if someone messes with kmalloc");
}
}
return start_mem;
}
/*
* Ugh, this is ugly, but we want the default case to run
* straight through, which is why we have the ugly goto's
*/
void *kmalloc(size_t size, int priority)
{
unsigned long flags;
unsigned long type;
int order, dma;
struct block_header *p;
struct page_descriptor *page, **pg;
struct size_descriptor *bucket = sizes;
/* Get order */
order = 0;
{
unsigned int realsize = size + sizeof(struct block_header);
for (;;) {
int ordersize = BLOCKSIZE(order);
if (realsize <= ordersize)
break;
order++;
bucket++;
if (ordersize)
continue;
printk("kmalloc of too large a block (%d bytes).\n", (int) size);
return NULL;
}
}
dma = 0;
type = MF_USED;
pg = &bucket->firstfree;
if (priority & GFP_DMA) {
dma = 1;
type = MF_DMA;
pg = &bucket->dmafree;
}
priority &= GFP_LEVEL_MASK;
/* Sanity check... */
if (intr_count && priority != GFP_ATOMIC) {
static int count = 0;
if (++count < 5) {
printk("kmalloc called nonatomically from interrupt %p\n",
__builtin_return_address(0));
priority = GFP_ATOMIC;
}
}
save_flags(flags);
cli();
page = *pg;
if (!page)
goto no_bucket_page;
p = page->firstfree;
if (p->bh_flags != MF_FREE)
goto not_free_on_freelist;
found_it:
page->firstfree = p->bh_next;
page->nfree--;
if (!page->nfree)
*pg = page->next;
restore_flags(flags);
bucket->nmallocs++;
bucket->nbytesmalloced += size;
p->bh_flags = type; /* As of now this block is officially in use */
p->bh_length = size;
#ifdef SADISTIC_KMALLOC
memset(p+1, 0xf0, size);
#endif
return p + 1; /* Pointer arithmetic: increments past header */
no_bucket_page:
/*
* If we didn't find a page already allocated for this
* bucket size, we need to get one..
*
* This can be done with ints on: it is private to this invocation
*/
restore_flags(flags);
{
int i, sz;
/* sz is the size of the blocks we're dealing with */
sz = BLOCKSIZE(order);
page = get_kmalloc_pages(priority, bucket->gfporder, dma);
if (!page)
goto no_free_page;
found_cached_page:
bucket->npages++;
page->order = order;
/* Loop for all but last block: */
i = (page->nfree = bucket->nblocks) - 1;
p = BH(page + 1);
while (i > 0) {
i--;
p->bh_flags = MF_FREE;
p->bh_next = BH(((long) p) + sz);
p = p->bh_next;
}
/* Last block: */
p->bh_flags = MF_FREE;
p->bh_next = NULL;
p = BH(page+1);
}
/*
* Now we're going to muck with the "global" freelist
* for this size: this should be uninterruptible
*/
cli();
page->next = *pg;
*pg = page;
goto found_it;
no_free_page:
/*
* No free pages, check the kmalloc cache of
* pages to see if maybe we have something available
*/
if (!dma && order < MAX_CACHE_ORDER) {
page = xchg(kmalloc_cache+order, page);
if (page)
goto found_cached_page;
}
{
static unsigned long last = 0;
if (priority != GFP_BUFFER && priority != GFP_IO &&
(last + 10 * HZ < jiffies)) {
last = jiffies;
printk("Couldn't get a free page.....\n");
}
return NULL;
}
not_free_on_freelist:
restore_flags(flags);
printk("Problem: block on freelist at %08lx isn't free.\n", (long) p);
return NULL;
}
void kfree(void *__ptr)
{
int dma;
unsigned long flags;
unsigned int order;
struct page_descriptor *page, **pg;
struct size_descriptor *bucket;
if (!__ptr)
goto null_kfree;
#define ptr ((struct block_header *) __ptr)
page = PAGE_DESC(ptr);
__ptr = ptr - 1;
if (~PAGE_MASK & (unsigned long)page->next)
goto bad_order;
order = page->order;
if (order >= sizeof(sizes) / sizeof(sizes[0]))
goto bad_order;
bucket = sizes + order;
dma = 0;
pg = &bucket->firstfree;
if (ptr->bh_flags == MF_DMA) {
dma = 1;
ptr->bh_flags = MF_USED;
pg = &bucket->dmafree;
}
if (ptr->bh_flags != MF_USED)
goto bad_order;
ptr->bh_flags = MF_FREE; /* As of now this block is officially free */
#ifdef SADISTIC_KMALLOC
memset(ptr+1, 0xe0, ptr->bh_length);
#endif
save_flags(flags);
cli();
bucket->nfrees++;
bucket->nbytesmalloced -= ptr->bh_length;
ptr->bh_next = page->firstfree;
page->firstfree = ptr;
if (!page->nfree++) {
/* Page went from full to one free block: put it on the freelist. */
if (bucket->nblocks == 1)
goto free_page;
page->next = *pg;
*pg = page;
}
/* If page is completely free, free it */
if (page->nfree == bucket->nblocks) {
for (;;) {
struct page_descriptor *tmp = *pg;
if (!tmp)
goto not_on_freelist;
if (tmp == page)
break;
pg = &tmp->next;
}
*pg = page->next;
free_page:
bucket->npages--;
free_kmalloc_pages(page, bucket->gfporder, dma);
}
restore_flags(flags);
null_kfree:
return;
bad_order:
printk("kfree of non-kmalloced memory: %p, next= %p, order=%d\n",
ptr+1, page->next, page->order);
return;
not_on_freelist:
restore_flags(flags);
printk("Ooops. page %p doesn't show on freelist.\n", page);
}
1.1 or1k/rc203soc/sw/uClinux/mm/memory.c
http://www.opencores.org/cvsweb.shtml/or1k/rc203soc/sw/uClinux/mm/memory.c?rev=1.1&content-type=text/x-cvsweb-markup
Index: memory.c
===================================================================
/*
* linux/mm/memory.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*/
/*
* demand-loading started 01.12.91 - seems it is high on the list of
* things wanted, and it should be easy to implement. - Linus
*/
/*
* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
* pages started 02.12.91, seems to work. - Linus.
*
* Tested sharing by executing about 30 /bin/sh: under the old kernel it
* would have taken more than the 6M I have free, but it worked well as
* far as I could see.
*
* Also corrected some "invalidate()"s - I wasn't doing enough of them.
*/
/*
* Real VM (paging to/from disk) started 18.12.91. Much more work and
* thought has to go into this. Oh, well..
* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
* Found it. Everything seems to work now.
* 20.12.91 - Ok, making the swap-device changeable like the root.
*/
/*
* 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@c...)
*/
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <asm/system.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
#include <asm/string.h>
unsigned long high_memory = 0;
/*
* We special-case the C-O-W ZERO_PAGE, because it's such
* a common occurrence (no need to read the page to know
* that it's zero - better for the cache and memory subsystem).
*/
static inline void copy_page(unsigned long from, unsigned long to)
{
if (from == ZERO_PAGE) {
memset((void *) to, 0, PAGE_SIZE);
return;
}
memcpy((void *) to, (void *) from, PAGE_SIZE);
}
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
mem_map_t * mem_map = NULL;
/*
* oom() prints a message (so that the user knows why the process died),
* and gives the process an untrappable SIGKILL.
*/
void oom(struct task_struct * task)
{
printk("\nOut of memory for %s.\n", task->comm);
task->sig->action[SIGKILL-1].sa_handler = NULL;
task->blocked &= ~(1<<(SIGKILL-1));
send_sig(SIGKILL,task,1);
}
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
static inline void free_one_pmd(pmd_t * dir)
{
pte_t * pte;
if (pmd_none(*dir))
return;
if (pmd_bad(*dir)) {
printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
pmd_clear(dir);
return;
}
pte = pte_offset(dir, 0);
pmd_clear(dir);
pte_free(pte);
}
static inline void free_one_pgd(pgd_t * dir)
{
int j;
pmd_t * pmd;
if (pgd_none(*dir))
return;
if (pgd_bad(*dir)) {
printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
pgd_clear(dir);
return;
}
pmd = pmd_offset(dir, 0);
pgd_clear(dir);
for (j = 0; j < PTRS_PER_PMD ; j++)
free_one_pmd(pmd+j);
pmd_free(pmd);
}
/*
* This function clears all user-level page tables of a process - this
* is needed by execve(), so that old pages aren't in the way.
*/
void clear_page_tables(struct task_struct * tsk)
{
int i;
pgd_t * page_dir;
page_dir = tsk->mm->pgd;
if (!page_dir || page_dir == swapper_pg_dir) {
printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
return;
}
flush_cache_mm(tsk->mm);
for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
free_one_pgd(page_dir + i);
flush_tlb_mm(tsk->mm);
}
/*
* This function frees up all page tables of a process when it exits. It
* is the same as "clear_page_tables()", except it also changes the process'
* page table directory to the kernel page tables and then frees the old
* page table directory.
*/
void free_page_tables(struct mm_struct * mm)
{
int i;
pgd_t * page_dir;
page_dir = mm->pgd;
if (!page_dir || page_dir == swapper_pg_dir) {
printk("Trying to free kernel page-directory: not good\n");
return;
}
for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
free_one_pgd(page_dir + i);
pgd_free(page_dir);
}
int new_page_tables(struct task_struct * tsk)
{
pgd_t * page_dir, * new_pg;
if (!(new_pg = pgd_alloc()))
return -ENOMEM;
page_dir = pgd_offset(&init_mm, 0);
flush_cache_mm(tsk->mm);
memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
flush_tlb_mm(tsk->mm);
SET_PAGE_DIR(tsk, new_pg);
tsk->mm->pgd = new_pg;
return 0;
}
static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
{
pte_t pte = *old_pte;
unsigned long page_nr;
if (pte_none(pte))
return;
if (!pte_present(pte)) {
swap_duplicate(pte_val(pte));
set_pte(new_pte, pte);
return;
}
page_nr = MAP_NR(pte_page(pte));
if (page_nr >= MAP_NR(high_memory) || PageReserved(mem_map+page_nr)) {
set_pte(new_pte, pte);
return;
}
if (cow)
pte = pte_wrprotect(pte);
if (delete_from_swap_cache(page_nr))
pte = pte_mkdirty(pte);
set_pte(new_pte, pte_mkold(pte));
set_pte(old_pte, pte);
mem_map[page_nr].count++;
}
static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
{
pte_t * src_pte, * dst_pte;
unsigned long end;
if (pmd_none(*src_pmd))
return 0;
if (pmd_bad(*src_pmd)) {
printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
pmd_clear(src_pmd);
return 0;
}
src_pte = pte_offset(src_pmd, address);
if (pmd_none(*dst_pmd)) {
if (!pte_alloc(dst_pmd, 0))
return -ENOMEM;
}
dst_pte = pte_offset(dst_pmd, address);
address &= ~PMD_MASK;
end = address + size;
if (end >= PMD_SIZE)
end = PMD_SIZE;
do {
/* I would like to switch arguments here, to make it
* consistent with copy_xxx_range and memcpy syntax.
*/
copy_one_pte(src_pte++, dst_pte++, cow);
address += PAGE_SIZE;
} while (address < end);
return 0;
}
static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
{
pmd_t * src_pmd, * dst_pmd;
unsigned long end;
int error = 0;
if (pgd_none(*src_pgd))
return 0;
if (pgd_bad(*src_pgd)) {
printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
pgd_clear(src_pgd);
return 0;
}
src_pmd = pmd_offset(src_pgd, address);
if (pgd_none(*dst_pgd)) {
if (!pmd_alloc(dst_pgd, 0))
return -ENOMEM;
}
dst_pmd = pmd_offset(dst_pgd, address);
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
if (error)
break;
address = (address + PMD_SIZE) & PMD_MASK;
} while (address < end);
return error;
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*/
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pgd_t * src_pgd, * dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
int error = 0, cow;
cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
src_pgd = pgd_offset(src, address);
dst_pgd = pgd_offset(dst, address);
flush_cache_range(src, vma->vm_start, vma->vm_end);
flush_cache_range(dst, vma->vm_start, vma->vm_end);
while (address < end) {
error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
if (error)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
}
/* Note that the src ptes get c-o-w treatment, so they change too. */
flush_tlb_range(src, vma->vm_start, vma->vm_end);
flush_tlb_range(dst, vma->vm_start, vma->vm_end);
return error;
}
static inline void free_pte(pte_t page)
{
if (pte_present(page)) {
unsigned long addr = pte_page(page);
if (addr >= high_memory || PageReserved(mem_map+MAP_NR(addr)))
return;
free_page(addr);
if (current->mm->rss <= 0)
return;
current->mm->rss--;
return;
}
swap_free(pte_val(page));
}
static inline void forget_pte(pte_t page)
{
if (!pte_none(page)) {
printk("forget_pte: old mapping existed!\n");
free_pte(page);
}
}
static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
pte_t * pte;
if (pmd_none(*pmd))
return;
if (pmd_bad(*pmd)) {
printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
pmd_clear(pmd);
return;
}
pte = pte_offset(pmd, address);
address &= ~PMD_MASK;
if (address + size > PMD_SIZE)
size = PMD_SIZE - address;
size >>= PAGE_SHIFT;
for (;;) {
pte_t page;
if (!size)
break;
page = *pte;
pte++;
size--;
if (pte_none(page))
continue;
pte_clear(pte-1);
free_pte(page);
}
}
static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
pmd_t * pmd;
unsigned long end;
if (pgd_none(*dir))
return;
if (pgd_bad(*dir)) {
printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
pgd_clear(dir);
return;
}
pmd = pmd_offset(dir, address);
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
zap_pte_range(pmd, address, end - address);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
}
/*
* remove user pages in a given range.
*/
int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
pgd_t * dir;
unsigned long end = address + size;
dir = pgd_offset(mm, address);
flush_cache_range(mm, end - size, end);
while (address < end) {
zap_pmd_range(dir, address, end - address);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
flush_tlb_range(mm, end - size, end);
return 0;
}
static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
{
unsigned long end;
address &= ~PMD_MASK;
end = address + size;
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
pte_t oldpage = *pte;
set_pte(pte, zero_pte);
forget_pte(oldpage);
address += PAGE_SIZE;
pte++;
} while (address < end);
}
static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
{
unsigned long end;
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
pte_t * pte = pte_alloc(pmd, address);
if (!pte)
return -ENOMEM;
zeromap_pte_range(pte, address, end - address, zero_pte);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
return 0;
}
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
int error = 0;
pgd_t * dir;
unsigned long beg = address;
unsigned long end = address + size;
pte_t zero_pte;
zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
dir = pgd_offset(current->mm, address);
flush_cache_range(current->mm, beg, end);
while (address < end) {
pmd_t *pmd = pmd_alloc(dir, address);
error = -ENOMEM;
if (!pmd)
break;
error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
if (error)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
flush_tlb_range(current->mm, beg, end);
return error;
}
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
* in null mappings (currently treated as "copy-on-access")
*/
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
unsigned long offset, pgprot_t prot)
{
unsigned long end;
address &= ~PMD_MASK;
end = address + size;
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
pte_t oldpage = *pte;
pte_clear(pte);
if (offset >= high_memory || PageReserved(mem_map+MAP_NR(offset)))
set_pte(pte, mk_pte(offset, prot));
forget_pte(oldpage);
address += PAGE_SIZE;
offset += PAGE_SIZE;
pte++;
} while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
unsigned long offset, pgprot_t prot)
{
unsigned long end;
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
offset -= address;
do {
pte_t * pte = pte_alloc(pmd, address);
if (!pte)
return -ENOMEM;
remap_pte_range(pte, address, end - address, address + offset, prot);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
return 0;
}
int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
{
int error = 0;
pgd_t * dir;
unsigned long beg = from;
unsigned long end = from + size;
offset -= from;
dir = pgd_offset(current->mm, from);
flush_cache_range(current->mm, beg, end);
while (from < end) {
pmd_t *pmd = pmd_alloc(dir, from);
error = -ENOMEM;
if (!pmd)
break;
error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
if (error)
break;
from = (from + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
flush_tlb_range(current->mm, beg, end);
return error;
}
/*
* sanity-check function..
*/
static void put_page(pte_t * page_table, pte_t pte)
{
if (!pte_none(*page_table)) {
free_page(pte_page(pte));
return;
}
/* no need for flush_tlb */
set_pte(page_table, pte);
}
/*
* This routine is used to map in a page into an address space: needed by
* execve() for the initial stack and environment pages.
*/
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
pgd_t * pgd;
pmd_t * pmd;
pte_t * pte;
if (page >= high_memory)
printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
if (mem_map[MAP_NR(page)].count != 1)
printk("mem_map disagrees with %08lx at %08lx\n",page,address);
pgd = pgd_offset(tsk->mm,address);
pmd = pmd_alloc(pgd, address);
if (!pmd) {
free_page(page);
oom(tsk);
return 0;
}
pte = pte_alloc(pmd, address);
if (!pte) {
free_page(page);
oom(tsk);
return 0;
}
if (!pte_none(*pte)) {
printk("put_dirty_page: page already exists\n");
free_page(page);
return 0;
}
flush_page_to_ram(page);
set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
/* no need for invalidate */
return page;
}
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
* Goto-purists beware: the only reason for goto's here is that it results
* in better assembly code.. The "default" path will see no jumps at all.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
* COW.
*
* We also mark the page dirty at this point even though the page will
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*/
void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
unsigned long address, int write_access)
{
pgd_t *page_dir;
pmd_t *page_middle;
pte_t *page_table, pte;
unsigned long old_page, new_page;
new_page = __get_free_page(GFP_KERNEL);
page_dir = pgd_offset(vma->vm_mm, address);
if (pgd_none(*page_dir))
goto end_wp_page;
if (pgd_bad(*page_dir))
goto bad_wp_pagedir;
page_middle = pmd_offset(page_dir, address);
if (pmd_none(*page_middle))
goto end_wp_page;
if (pmd_bad(*page_middle))
goto bad_wp_pagemiddle;
page_table = pte_offset(page_middle, address);
pte = *page_table;
if (!pte_present(pte))
goto end_wp_page;
if (pte_write(pte))
goto end_wp_page;
old_page = pte_page(pte);
if (old_page >= high_memory)
goto bad_wp_page;
tsk->min_flt++;
/*
* Do we need to copy?
*/
if (mem_map[MAP_NR(old_page)].count != 1) {
if (new_page) {
if (PageReserved(mem_map + MAP_NR(old_page)))
++vma->vm_mm->rss;
copy_page(old_page,new_page);
flush_page_to_ram(old_page);
flush_page_to_ram(new_page);
flush_cache_page(vma, address);
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
free_page(old_page);
flush_tlb_page(vma, address);
return;
}
flush_cache_page(vma, address);
set_pte(page_table, BAD_PAGE);
flush_tlb_page(vma, address);
free_page(old_page);
oom(tsk);
return;
}
flush_cache_page(vma, address);
set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
flush_tlb_page(vma, address);
if (new_page)
free_page(new_page);
return;
bad_wp_page:
printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
send_sig(SIGKILL, tsk, 1);
goto end_wp_page;
bad_wp_pagemiddle:
printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
send_sig(SIGKILL, tsk, 1);
goto end_wp_page;
bad_wp_pagedir:
printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
send_sig(SIGKILL, tsk, 1);
end_wp_page:
if (new_page)
free_page(new_page);
return;
}
/*
* Ugly, ugly, but the goto's result in better assembly..
*/
int verify_area(int type, const void * addr, unsigned long size)
{
struct vm_area_struct * vma;
unsigned long start = (unsigned long) addr;
/* If the current user space is mapped to kernel space (for the
* case where we use a fake user buffer with get_fs/set_fs()) we
* don't expect to find the address in the user vm map.
*/
if (!size || get_fs() == KERNEL_DS)
return 0;
vma = find_vma(current->mm, start);
if (!vma)
goto bad_area;
if (vma->vm_start > start)
goto check_stack;
good_area:
if (type == VERIFY_WRITE)
goto check_write;
for (;;) {
struct vm_area_struct * next;
if (!(vma->vm_flags & VM_READ))
goto bad_area;
if (vma->vm_end - start >= size)
return 0;
next = vma->vm_next;
if (!next || vma->vm_end != next->vm_start)
goto bad_area;
vma = next;
}
check_write:
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
if (!wp_works_ok)
goto check_wp_fault_by_hand;
for (;;) {
if (vma->vm_end - start >= size)
break;
if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
goto bad_area;
vma = vma->vm_next;
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
}
return 0;
check_wp_fault_by_hand:
size--;
size += start & ~PAGE_MASK;
size >>= PAGE_SHIFT;
start &= PAGE_MASK;
for (;;) {
do_wp_page(current, vma, start, 1);
if (!size)
break;
size--;
start += PAGE_SIZE;
if (start < vma->vm_end)
continue;
vma = vma->vm_next;
if (!vma || vma->vm_start != start)
goto bad_area;
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;;
}
return 0;
check_stack:
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
if (expand_stack(vma, start) == 0)
goto good_area;
bad_area:
return -EFAULT;
}
/*
* This function zeroes out partial mmap'ed pages at truncation time..
*/
static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
pgd_t *page_dir;
pmd_t *page_middle;
pte_t *page_table, pte;
page_dir = pgd_offset(vma->vm_mm, address);
if (pgd_none(*page_dir))
return;
if (pgd_bad(*page_dir)) {
printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
pgd_clear(page_dir);
return;
}
page_middle = pmd_offset(page_dir, address);
if (pmd_none(*page_middle))
return;
if (pmd_bad(*page_middle)) {
printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
pmd_clear(page_middle);
return;
}
page_table = pte_offset(page_middle, address);
pte = *page_table;
if (!pte_present(pte))
return;
flush_cache_page(vma, address);
address &= ~PAGE_MASK;
address += pte_page(pte);
if (address >= high_memory)
return;
memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
flush_page_to_ram(pte_page(pte));
}
/*
* Handle all mappings that got truncated by a "truncate()"
* system call.
*
* NOTE! We have to be ready to update the memory sharing
* between the file and the memory map for a potential last
* incomplete page. Ugly, but necessary.
*/
void vmtruncate(struct inode * inode, unsigned long offset)
{
struct vm_area_struct * mpnt;
truncate_inode_pages(inode, offset);
if (!inode->i_mmap)
return;
mpnt = inode->i_mmap;
do {
unsigned long start = mpnt->vm_start;
unsigned long len = mpnt->vm_end - start;
unsigned long diff;
/* mapping wholly truncated? */
if (mpnt->vm_offset >= offset) {
zap_page_range(mpnt->vm_mm, start, len);
continue;
}
/* mapping wholly unaffected? */
diff = offset - mpnt->vm_offset;
if (diff >= len)
continue;
/* Ok, partially affected.. */
start += diff;
len = (len - diff) & PAGE_MASK;
if (start & ~PAGE_MASK) {
partial_clear(mpnt, start);
start = (start + ~PAGE_MASK) & PAGE_MASK;
}
zap_page_range(mpnt->vm_mm, start, len);
} while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
}
static inline void do_swap_page(struct task_struct * tsk,
struct vm_area_struct * vma, unsigned long address,
pte_t * page_table, pte_t entry, int write_access)
{
pte_t page;
if (!vma->vm_ops || !vma->vm_ops->swapin) {
swap_in(tsk, vma, page_table, pte_val(entry), write_access);
flush_page_to_ram(pte_page(*page_table));
return;
}
page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
if (pte_val(*page_table) != pte_val(entry)) {
free_page(pte_page(page));
return;
}
if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
page = pte_wrprotect(page);
++vma->vm_mm->rss;
++tsk->maj_flt;
flush_page_to_ram(pte_page(page));
set_pte(page_table, page);
return;
}
/*
* do_no_page() tries to create a new page mapping. It aggressively
* tries to share with existing pages, but makes a separate copy if
* the "write_access" parameter is true in order to avoid the next
* page fault.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*/
void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
unsigned long address, int write_access)
{
pgd_t * pgd;
pmd_t * pmd;
pte_t * page_table;
pte_t entry;
unsigned long page;
pgd = pgd_offset(tsk->mm, address);
pmd = pmd_alloc(pgd, address);
if (!pmd)
goto no_memory;
page_table = pte_alloc(pmd, address);
if (!page_table)
goto no_memory;
entry = *page_table;
if (pte_present(entry))
goto is_present;
if (!pte_none(entry))
goto swap_page;
address &= PAGE_MASK;
if (!vma->vm_ops || !vma->vm_ops->nopage)
goto anonymous_page;
/*
* The third argument is "no_share", which tells the low-level code
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection
*/
page = vma->vm_ops->nopage(vma, address,
(vma->vm_flags & VM_SHARED)?0:write_access);
if (!page)
goto sigbus;
++tsk->maj_flt;
++vma->vm_mm->rss;
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if write_access is true, we either now have
* a exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
flush_page_to_ram(page);
entry = mk_pte(page, vma->vm_page_prot);
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
} else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
put_page(page_table, entry);
/* no need to invalidate: a not-present page shouldn't be cached */
return;
anonymous_page:
entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
if (write_access) {
unsigned long page = __get_free_page(GFP_KERNEL);
if (!page)
goto sigbus;
memset((void *) page, 0, PAGE_SIZE);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
vma->vm_mm->rss++;
tsk->min_flt++;
flush_page_to_ram(page);
}
put_page(page_table, entry);
return;
sigbus:
force_sig(SIGBUS, current);
put_page(page_table, BAD_PAGE);
/* no need to invalidate, wasn't present */
return;
swap_page:
do_swap_page(tsk, vma, address, page_table, entry, write_access);
return;
no_memory:
oom(tsk);
is_present:
return;
}
/*
* The above separate functions for the no-page and wp-page
* cases will go away (they mostly do the same thing anyway),
* and we'll instead use only a general "handle_mm_fault()".
*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*/
static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t * pte)
{
if (!pte_present(*pte)) {
do_no_page(current, vma, address, write_access);
return;
}
set_pte(pte, pte_mkyoung(*pte));
flush_tlb_page(vma, address);
if (!write_access)
return;
if (pte_write(*pte)) {
set_pte(pte, pte_mkdirty(*pte));
flush_tlb_page(vma, address);
return;
}
do_wp_page(current, vma, address, write_access);
}
void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
int write_access)
{
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
pgd = pgd_offset(vma->vm_mm, address);
pmd = pmd_alloc(pgd, address);
if (!pmd)
goto no_memory;
pte = pte_alloc(pmd, address);
if (!pte)
goto no_memory;
handle_pte_fault(vma, address, write_access, pte);
update_mmu_cache(vma, address, *pte);
return;
no_memory:
oom(current);
}
1.1 or1k/rc203soc/sw/uClinux/mm/mlock.c
http://www.opencores.org/cvsweb.shtml/or1k/rc203soc/sw/uClinux/mm/mlock.c?rev=1.1&content-type=text/x-cvsweb-markup
Index: mlock.c
===================================================================
/*
* linux/mm/mlock.c
*
* (C) Copyright 1995 Linus Torvalds
*/
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/errno.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/malloc.h>
#include <asm/segment.h>
#include <asm/system.h>
#include <asm/pgtable.h>
static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags)
{
vma->vm_flags = newflags;
return 0;
}
static inline int mlock_fixup_start(struct vm_area_struct * vma,
unsigned long end, int newflags)
{
struct vm_area_struct * n;
n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!n)
return -EAGAIN;
*n = *vma;
vma->vm_start = end;
n->vm_end = end;
vma->vm_offset += vma->vm_start - n->vm_start;
n->vm_flags = newflags;
if (n->vm_inode)
n->vm_inode->i_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
insert_vm_struct(current->mm, n);
return 0;
}
static inline int mlock_fixup_end(struct vm_area_struct * vma,
unsigned long start, int newflags)
{
struct vm_area_struct * n;
n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!n)
return -EAGAIN;
*n = *vma;
vma->vm_end = start;
n->vm_start = start;
n->vm_offset += n->vm_start - vma->vm_start;
n->vm_flags = newflags;
if (n->vm_inode)
n->vm_inode->i_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
insert_vm_struct(current->mm, n);
return 0;
}
static inline int mlock_fixup_middle(struct vm_area_struct * vma,
unsigned long start, unsigned long end, int newflags)
{
struct vm_area_struct * left, * right;
left = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!left)
return -EAGAIN;
right = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!right) {
kfree(left);
return -EAGAIN;
}
*left = *vma;
*right = *vma;
left->vm_end = start;
vma->vm_start = start;
vma->vm_end = end;
right->vm_start = end;
vma->vm_offset += vma->vm_start - left->vm_start;
right->vm_offset += right->vm_start - left->vm_start;
vma->vm_flags = newflags;
if (vma->vm_inode)
vma->vm_inode->i_count += 2;
if (vma->vm_ops && vma->vm_ops->open) {
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
insert_vm_struct(current->mm, left);
insert_vm_struct(current->mm, right);
return 0;
}
static int mlock_fixup(struct vm_area_struct * vma,
unsigned long start, unsigned long end, unsigned int newflags)
{
int pages, retval;
if (newflags == vma->vm_flags)
return 0;
if (start == vma->vm_start) {
if (end == vma->vm_end)
retval = mlock_fixup_all(vma, newflags);
else
retval = mlock_fixup_start(vma, end, newflags);
} else {
if (end == vma->vm_end)
retval = mlock_fixup_end(vma, start, newflags);
else
retval = mlock_fixup_middle(vma, start, end, newflags);
}
if (!retval) {
/* keep track of amount of locked VM */
pages = (end - start) >> PAGE_SHIFT;
if (!(newflags & VM_LOCKED))
pages = -pages;
vma->vm_mm->locked_vm += pages;
if ((newflags & VM_LOCKED) && (newflags & VM_READ))
while (start < end) {
int c = get_user((int *) start);
__asm__ __volatile__("": :"r" (c));
start += PAGE_SIZE;
}
}
return retval;
}
static int do_mlock(unsigned long start, size_t len, int on)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * next;
int error;
if (!suser())
return -EPERM;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
vma = find_vma(current->mm, start);
if (!vma || vma->vm_start > start)
return -ENOMEM;
for (nstart = start ; ; ) {
unsigned int newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
newflags = vma->vm_flags | VM_LOCKED;
if (!on)
newflags &= ~VM_LOCKED;
if (vma->vm_end >= end) {
error = mlock_fixup(vma, nstart, end, newflags);
break;
}
tmp = vma->vm_end;
next = vma->vm_next;
error = mlock_fixup(vma, nstart, tmp, newflags);
if (error)
break;
nstart = tmp;
vma = next;
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
}
}
merge_segments(current->mm, start, end);
return error;
}
asmlinkage int sys_mlock(unsigned long start, size_t len)
{
unsigned long locked;
unsigned long lock_limit;
len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
start &= PAGE_MASK;
locked = len >> PAGE_SHIFT;
locked += current->mm->locked_vm;
lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
lock_limit >>= PAGE_SHIFT;
/* check against resource limits */
if (locked > lock_limit)
return -ENOMEM;
/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
if (locked > (MAP_NR(high_memory) >> 1))
return -ENOMEM;
return do_mlock(start, len, 1);
}
asmlinkage int sys_munlock(unsigned long start, size_t len)
{
len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
start &= PAGE_MASK;
return do_mlock(start, len, 0);
}
static int do_mlockall(int flags)
{
int error;
unsigned int def_flags;
struct vm_area_struct * vma;
if (!suser())
return -EPERM;
def_flags = 0;
if (flags & MCL_FUTURE)
def_flags = VM_LOCKED;
current->mm->def_flags = def_flags;
error = 0;
for (vma = current->mm->mmap; vma ; vma = vma->vm_next) {
unsigned int newflags;
newflags = vma->vm_flags | VM_LOCKED;
if (!(flags & MCL_CURRENT))
newflags &= ~VM_LOCKED;
error = mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags);
if (error)
break;
}
merge_segments(current->mm, 0, TASK_SIZE);
return error;
}
asmlinkage int sys_mlockall(int flags)
{
unsigned long lock_limit;
if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
return -EINVAL;
lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
lock_limit >>= PAGE_SHIFT;
if (current->mm->total_vm > lock_limit)
return -ENOMEM;
/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
if (current->mm->total_vm > (MAP_NR(high_memory) >> 1))
return -ENOMEM;
return do_mlockall(flags);
}
asmlinkage int sys_munlockall(void)
{
return do_mlockall(0);
}
1.1 or1k/rc203soc/sw/uClinux/mm/mmap.c
http://www.opencores.org/cvsweb.shtml/or1k/rc203soc/sw/uClinux/mm/mmap.c?rev=1.1&content-type=text/x-cvsweb-markup
Index: mmap.c
===================================================================
/*
* linux/mm/mmap.c
*
* Written by obz.
*/
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/errno.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/malloc.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <asm/segment.h>
#include <asm/system.h>
#include <asm/pgtable.h>
/*
* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
*
* map_type prot
* PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (yes) yes w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
*
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
*
*/
pgprot_t protection_map[16] = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
/*
* Check that a process has enough memory to allocate a
* new virtual mapping.
*/
int vm_enough_memory(long pages)
{
/*
* stupid algorithm to decide if we have enough memory: while
* simple, it hopefully works in most obvious cases.. Easy to
* fool it, but this should catch most mistakes.
*/
long freepages;
freepages = buffermem >> PAGE_SHIFT;
freepages += page_cache_size;
if (freepages <= (MAP_NR(high_memory) >> 4) + 48)
freepages >>= 1;
freepages += nr_free_pages;
freepages += nr_swap_pages;
freepages -= MAP_NR(high_memory) >> 4;
return freepages > pages;
}
asmlinkage unsigned long sys_brk(unsigned long brk)
{
unsigned long rlim;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
if (brk < mm->end_code)
return mm->brk;
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk)
return mm->brk = brk;
/*
* Always allow shrinking brk
*/
if (brk <= mm->brk) {
mm->brk = brk;
do_munmap(newbrk, oldbrk-newbrk);
return brk;
}
/*
* Check against rlimit and stack..
*/
rlim = current->rlim[RLIMIT_DATA].rlim_cur;
if (rlim >= RLIM_INFINITY)
rlim = ~0;
if (brk - mm->end_code > rlim)
return mm->brk;
/*
* Check against existing mmap mappings.
*/
if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
return mm->brk;
/*
* Check if we have enough memory..
*/
if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT))
return mm->brk;
/*
* Ok, looks good - let it rip.
*/
if(do_mmap(NULL, oldbrk, newbrk-oldbrk,
PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_FIXED|MAP_PRIVATE, 0) != oldbrk)
return mm->brk;
return mm->brk = brk;
}
/*
* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
* internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
* into "VM_xxx".
*/
static inline unsigned long vm_flags(unsigned long prot, unsigned long flags)
{
#define _trans(x,bit1,bit2) \
((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
unsigned long prot_bits, flag_bits;
prot_bits =
_trans(prot, PROT_READ, VM_READ) |
_trans(prot, PROT_WRITE, VM_WRITE) |
_trans(prot, PROT_EXEC, VM_EXEC);
flag_bits =
_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
_trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
return prot_bits | flag_bits;
#undef _trans
}
unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long off)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma;
if ((len = PAGE_ALIGN(len)) == 0)
return addr;
if (len > MAX_USER_ADDR || addr > MAX_USER_ADDR-len)
return -EINVAL;
/* offset overflow? */
if (off + len < off)
return -EINVAL;
/* mlock MCL_FUTURE? */
if (mm->def_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
locked += len;
if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
return -EAGAIN;
}
/*
* do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
if (file != NULL) {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot & PROT_WRITE) && !(file->f_mode & 2))
return -EACCES;
/*
* make sure there are no mandatory locks on the file.
*/
if (locks_verify_locked(file->f_inode))
return -EAGAIN;
/* cevans -- whoops another append-only file flaw */
if (IS_APPEND(file->f_inode) && (file->f_mode & 2))
return -EACCES;
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & 1))
return -EACCES;
break;
default:
return -EINVAL;
}
if (flags & MAP_DENYWRITE) {
if (file->f_inode->i_writecount > 0)
return -ETXTBSY;
}
} else if ((flags & MAP_TYPE) != MAP_PRIVATE)
return -EINVAL;
/*
* obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
if (flags & MAP_FIXED) {
if (addr & ~PAGE_MASK)
return -EINVAL;
} else {
addr = get_unmapped_area(addr, len);
if (!addr)
return -ENOMEM;
}
/*
* determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
if (file && (!file->f_op || !file->f_op->mmap))
return -ENODEV;
vma = (struct vm_area_struct *)kmalloc(sizeof(struct vm_area_struct),
GFP_KERNEL);
if (!vma)
return -ENOMEM;
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
if (file) {
if (file->f_mode & 1)
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_SHARED) {
vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
/*
* This looks strange, but when we don't have the file open
* for writing, we can demote the shared mapping to a simpler
* private mapping. That also takes care of a security hole
* with ptrace() writing to a shared mapping without write
* permissions.
*
* We leave the VM_MAYSHARE bit on, just to get correct output
* from /proc/xxx/maps..
*/
if (!(file->f_mode & 2))
vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
}
} else
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_offset = off;
vma->vm_inode = NULL;
vma->vm_pte = 0;
do_munmap(addr, len); /* Clear old maps */
/* Check against address space limit. */
if ((mm->total_vm << PAGE_SHIFT) + len
> current->rlim[RLIMIT_AS].rlim_cur) {
kfree(vma);
return -ENOMEM;
}
/* Private writable mapping? Check memory availability.. */
if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE) {
if (!vm_enough_memory(len >> PAGE_SHIFT)) {
kfree(vma);
return -ENOMEM;
}
}
if (file) {
int error = file->f_op->mmap(file->f_inode, file, vma);
if (error) {
kfree(vma);
return error;
}
}
flags = vma->vm_flags;
insert_vm_struct(mm, vma);
merge_segments(mm, vma->vm_start, vma->vm_end);
/* merge_segments might have merged our vma, so we can't use it any more */
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
unsigned long start = addr;
mm->locked_vm += len >> PAGE_SHIFT;
do {
char c = get_user((char *) start);
len -= PAGE_SIZE;
start += PAGE_SIZE;
__asm__ __volatile__("": :"r" (c));
} while (len > 0);
}
return addr;
}
/*
* Get an address range which is currently unmapped.
* For mmap() without MAP_FIXED and shmat() with addr=0.
* Return value 0 means ENOMEM.
*/
unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
{
struct vm_area_struct * vmm;
if (len > MAX_USER_ADDR)
return 0;
if (!addr)
addr = MMAP_SEARCH_START;
addr = PAGE_ALIGN(addr);
for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
/* At this point: (!vmm || addr < vmm->vm_end). */
if (MAX_USER_ADDR - len < addr)
return 0;
if (!vmm || addr + len <= vmm->vm_start)
return addr;
addr = vmm->vm_end;
}
}
/*
* Searching a VMA in the linear list task->mm->mmap is horribly slow.
* Use an AVL (Adelson-Velskii and Landis) tree to speed up this search
* from O(n) to O(log n), where n is the number of VMAs of the task
* (typically around 6, but may reach 3000 in some cases).
* Written by Bruno Haible <haible@m...>.
*/
/* We keep the list and tree sorted by address. */
#define vm_avl_key vm_end
#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */
/*
* task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap
* or, more exactly, its root.
* A vm_area_struct has the following fields:
* vm_avl_left left son of a tree node
* vm_avl_right right son of a tree node
* vm_avl_height 1+max(heightof(left),heightof(right))
* The empty tree is represented as NULL.
*/
/* Since the trees are balanced, their height will never be large. */
#define avl_maxheight 41 /* why this? a small exercise */
#define heightof(tree) ((tree) == avl_empty ? 0 : (tree)->vm_avl_height)
/*
* Consistency and balancing rules:
* 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right))
* 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1
* 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key,
* foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key.
*/
/* Look up the nodes at the left and at the right of a given node. */
static inline void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
{
vm_avl_key_t key = node->vm_avl_key;
*to_the_left = *to_the_right = NULL;
for (;;) {
if (tree == avl_empty) {
printk("avl_neighbours: node not found in the tree\n");
return;
}
if (key == tree->vm_avl_key)
break;
if (key < tree->vm_avl_key) {
*to_the_right = tree;
tree = tree->vm_avl_left;
} else {
*to_the_left = tree;
tree = tree->vm_avl_right;
}
}
if (tree != node) {
printk("avl_neighbours: node not exactly found in the tree\n");
return;
}
if (tree->vm_avl_left != avl_empty) {
struct vm_area_struct * node;
for (node = tree->vm_avl_left; node->vm_avl_right != avl_empty; node = node->vm_avl_right)
continue;
*to_the_left = node;
}
if (tree->vm_avl_right != avl_empty) {
struct vm_area_struct * node;
for (node = tree->vm_avl_right; node->vm_avl_left != avl_empty; node = node->vm_avl_left)
continue;
*to_the_right = node;
}
if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right))
printk("avl_neighbours: tree inconsistent with list\n");
}
/*
* Rebalance a tree.
* After inserting or deleting a node of a tree we have a sequence of subtrees
* nodes[0]..nodes[k-1] such that
* nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}.
*/
static inline void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
{
for ( ; count > 0 ; count--) {
struct vm_area_struct ** nodeplace = *--nodeplaces_ptr;
struct vm_area_struct * node = *nodeplace;
struct vm_area_struct * nodeleft = node->vm_avl_left;
struct vm_area_struct * noderight = node->vm_avl_right;
int heightleft = heightof(nodeleft);
int heightright = heightof(noderight);
if (heightright + 1 < heightleft) {
/* */
/* * */
/* / \ */
/* n+2 n */
/* */
struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left;
struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right;
int heightleftright = heightof(nodeleftright);
if (heightof(nodeleftleft) >= heightleftright) {
/* */
/* * n+2|n+3 */
/* / \ / \ */
/* n+2 n --> / n+1|n+2 */
/* / \ | / \ */
/* n+1 n|n+1 n+1 n|n+1 n */
/* */
node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node;
nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright);
*nodeplace = nodeleft;
} else {
/* */
/* * n+2 */
/* / \ / \ */
/* n+2 n --> n+1 n+1 */
/* / \ / \ / \ */
/* n n+1 n L R n */
/* / \ */
/* L R */
/* */
nodeleft->vm_avl_right = nodeleftright->vm_avl_left;
node->vm_avl_left = nodeleftright->vm_avl_right;
nodeleftright->vm_avl_left = nodeleft;
nodeleftright->vm_avl_right = node;
nodeleft->vm_avl_height = node->vm_avl_height = heightleftright;
nodeleftright->vm_avl_height = heightleft;
*nodeplace = nodeleftright;
}
}
else if (heightleft + 1 < heightright) {
/* similar to the above, just interchange 'left' <--> 'right' */
struct vm_area_struct * noderightright = noderight->vm_avl_right;
struct vm_area_struct * noderightleft = noderight->vm_avl_left;
int heightrightleft = heightof(noderightleft);
if (heightof(noderightright) >= heightrightleft) {
node->vm_avl_right = noderightleft; noderight->vm_avl_left = node;
noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft);
*nodeplace = noderight;
} else {
noderight->vm_avl_left = noderightleft->vm_avl_right;
node->vm_avl_right = noderightleft->vm_avl_left;
noderightleft->vm_avl_right = noderight;
noderightleft->vm_avl_left = node;
noderight->vm_avl_height = node->vm_avl_height = heightrightleft;
noderightleft->vm_avl_height = heightright;
*nodeplace = noderightleft;
}
}
else {
int height = (heightleft<heightright ? heightright : heightleft) + 1;
if (height == node->vm_avl_height)
break;
node->vm_avl_height = height;
}
}
}
/* Insert a node into a tree. */
static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree)
{
vm_avl_key_t key = new_node->vm_avl_key;
struct vm_area_struct ** nodeplace = ptree;
struct vm_area_struct ** stack[avl_maxheight];
int stack_count = 0;
struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
for (;;) {
struct vm_area_struct * node = *nodeplace;
if (node == avl_empty)
break;
*stack_ptr++ = nodeplace; stack_count++;
if (key < node->vm_avl_key)
nodeplace = &node->vm_avl_left;
else
nodeplace = &node->vm_avl_right;
}
new_node->vm_avl_left = avl_empty;
new_node->vm_avl_right = avl_empty;
new_node->vm_avl_height = 1;
*nodeplace = new_node;
avl_rebalance(stack_ptr,stack_count);
}
/* Insert a node into a tree, and
* return the node to the left of it and the node to the right of it.
*/
static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree,
struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
{
vm_avl_key_t key = new_node->vm_avl_key;
struct vm_area_struct ** nodeplace = ptree;
struct vm_area_struct ** stack[avl_maxheight];
int stack_count = 0;
struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
*to_the_left = *to_the_right = NULL;
for (;;) {
struct vm_area_struct * node = *nodeplace;
if (node == avl_empty)
break;
*stack_ptr++ = nodeplace; stack_count++;
if (key < node->vm_avl_key) {
*to_the_right = node;
nodeplace = &node->vm_avl_left;
} else {
*to_the_left = node;
nodeplace = &node->vm_avl_right;
}
}
new_node->vm_avl_left = avl_empty;
new_node->vm_avl_right = avl_empty;
new_node->vm_avl_height = 1;
*nodeplace = new_node;
avl_rebalance(stack_ptr,stack_count);
}
/* Removes a node out of a tree. */
static inline void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree)
{
vm_avl_key_t key = node_to_delete->vm_avl_key;
struct vm_area_struct ** nodeplace = ptree;
struct vm_area_struct ** stack[avl_maxheight];
int stack_count = 0;
struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
struct vm_area_struct ** nodeplace_to_delete;
for (;;) {
struct vm_area_struct * node = *nodeplace;
if (node == avl_empty) {
/* what? node_to_delete not found in tree? */
printk("avl_remove: node to delete not found in tree\n");
return;
}
*stack_ptr++ = nodeplace; stack_count++;
if (key == node->vm_avl_key)
break;
if (key < node->vm_avl_key)
nodeplace = &node->vm_avl_left;
else
nodeplace = &node->vm_avl_right;
}
nodeplace_to_delete = nodeplace;
/* Have to remove node_to_delete = *nodeplace_to_delete. */
if (node_to_delete->vm_avl_left == avl_empty) {
*nodeplace_to_delete = node_to_delete->vm_avl_right;
stack_ptr--; stack_count--;
} else {
struct vm_area_struct *** stack_ptr_to_delete = stack_ptr;
struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left;
struct vm_area_struct * node;
for (;;) {
node = *nodeplace;
if (node->vm_avl_right == avl_empty)
break;
*stack_ptr++ = nodeplace; stack_count++;
nodeplace = &node->vm_avl_right;
}
*nodeplace = node->vm_avl_left;
/* node replaces node_to_delete */
node->vm_avl_left = node_to_delete->vm_avl_left;
node->vm_avl_right = node_to_delete->vm_avl_right;
node->vm_avl_height = node_to_delete->vm_avl_height;
*nodeplace_to_delete = node; /* replace node_to_delete */
*stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */
}
avl_rebalance(stack_ptr,stack_count);
}
#ifdef DEBUG_AVL
/* print a list */
static void printk_list (struct vm_area_struct * vma)
{
printk("[");
while (vma) {
printk("%08lX-%08lX", vma->vm_start, vma->vm_end);
vma = vma->vm_next;
if (!vma)
break;
printk(" ");
}
printk("]");
}
/* print a tree */
static void printk_avl (struct vm_area_struct * tree)
{
if (tree != avl_empty) {
printk("(");
if (tree->vm_avl_left != avl_empty) {
printk_avl(tree->vm_avl_left);
printk("<");
}
printk("%08lX-%08lX", tree->vm_start, tree->vm_end);
if (tree->vm_avl_right != avl_empty) {
printk(">");
printk_avl(tree->vm_avl_right);
}
printk(")");
}
}
static char *avl_check_point = "somewhere";
/* check a tree's con |