Re: Securely removing the LDT restrictions (was: DOS4WG under dosemu)

Jamie Lokier (jamie@rebellion.co.uk)
Sat, 8 Jun 96 02:05 BST


Linus, a patch is enclosed preceded by plenty of explanation.

The idea is to securely allow LDT entries that cover any address range,
which appears to be required by some software running under dosemu.
Notably, DOS4/GW which is used by many popular games, and some graphics
software using DJGPP. Disregard or use as you wish. Ta.

Hans, I'm cross-posting this back to linux-kernel (and Linus) because it
includes a kernel patch. I'd like the kernel hackers to have a look at
it. I'm pretty sure it's OK.

Note that without this, your emumodule hack to simply remove the LDT
restriction for root means that anyone can hack the kernel if they can
run dosemu. They just run the kernel-scrubbing DPMI client of their
choice.

Anyway, to the point:

I wrote this:

>> Ensure all page-directory entries for VM >= TASK_SIZE don't have
>> the _PAGE_USER bit set. Thus no ring 3 task can access any of
>> the kernel pages, (including dosemu and whatever is running
>> inside dosemu). This doesn't interfere with the other use of
>> _PAGE_USER, in ptes.

>>>>> "Hans" == Hans Lermen <lermen@elserv.ffm.fgan.de> writes:

Hans> You are right, this will work. ( but you will have _lots_ of
Hans> GPFs to handle when a DPMI client accesses such an area. We
Hans> have to take care about the performance too )

The client won't access that area unless it has a bug. If it does,
dosemu will call the client's fault handler and the client will die. No
performance problem there.

The only reason the client wants large LDT entries is so that code based
somewhere in the middle of memory can access low memory (for BIOS, DOS
and video) without using a selector, by "wrapping around" using a
large/negative pointer value.

Hans> However, this one is a pretty well known idea, Alexandre
Hans> Julliard (Wine) and others already proposed this. What makes
Hans> it difficult to realize is that we have to change the kernel's
Hans> paging stuff, and as this is the most critical part, Linus
Hans> will have to think it over and over again ( if he ever will
Hans> agree ). Don't expect this for 2.0.

No, I don't expect it for 2.0. But Linus seems to have done most of the
work already -- there is already pte_alloc_kernel vs. pte_alloc, etc.
I've checked everywhere those functions are called, as well as
everywhere _PAGE_TABLE is used. pte_alloc_kernel is always called for
kernel page tables, and pte_alloc is always called for user page
tables.

All that remains is to have pte_alloc_kernel actually behave differently
from pte_alloc, and to set up the initial mappings without _PAGE_USER.

Enclosed is a patch against pre2.0.14 which does this, and removes the
restriction on the address range of LDT entries.

I have run the program which exploited the LDT hole in 1.3.66, and it
now crashes as expected. It sets up LDT entries (which it warns are
insecure -- they were then), but SEGVs when it tries to modify the
running kernel.

Now, this can go in the kernel, or it can be distributed with DOSEMU
with the following message: "To run DOS4/GW applications and maybe some
DJGPP games under dosemu, please apply the enclosed patch to your
otherwise perfect 2.0 kernel..." :-)

Or there is always "We recommend using version 2.1.0 or later of the
Linux kernel..." :-)

Unfortunately this is one thing that emumodule.o can't override.

-- Jamie

---start of patch---

--- linux/include/asm-i386/pgtable.h.orig Sat Jun 8 00:20:59 1996
+++ linux/include/asm-i386/pgtable.h Sat Jun 8 00:57:26 1996
@@ -214,7 +214,9 @@
#define _PAGE_DIRTY 0x040
#define _PAGE_4M 0x080 /* 4 MB page, Pentium+.. */

-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_TABLE_USER (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_TABLE_KERNEL (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)

#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED)
@@ -298,7 +300,7 @@
extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; }

extern inline int pmd_none(pmd_t pmd) { return !pmd_val(pmd); }
-extern inline int pmd_bad(pmd_t pmd) { return (pmd_val(pmd) & ~PAGE_MASK) != _PAGE_TABLE || pmd_val(pmd) > high_memory; }
+extern inline int pmd_bad(pmd_t pmd) { return (pmd_val(pmd) & ~(PAGE_MASK | _PAGE_USER)) != _PAGE_TABLE_KERNEL || pmd_val(pmd) > high_memory; }
extern inline int pmd_present(pmd_t pmd) { return pmd_val(pmd) & _PAGE_PRESENT; }
extern inline void pmd_clear(pmd_t * pmdp) { pmd_val(*pmdp) = 0; }

@@ -384,17 +386,17 @@
pte_t * page = (pte_t *) get_free_page(GFP_KERNEL);
if (pmd_none(*pmd)) {
if (page) {
- pmd_val(*pmd) = _PAGE_TABLE | (unsigned long) page;
+ pmd_val(*pmd) = _PAGE_TABLE_KERNEL | (unsigned long) page;
return page + address;
}
- pmd_val(*pmd) = _PAGE_TABLE | (unsigned long) BAD_PAGETABLE;
+ pmd_val(*pmd) = _PAGE_TABLE_KERNEL | (unsigned long) BAD_PAGETABLE;
return NULL;
}
free_page((unsigned long) page);
}
if (pmd_bad(*pmd)) {
printk("Bad pmd in pte_alloc: %08lx\n", pmd_val(*pmd));
- pmd_val(*pmd) = _PAGE_TABLE | (unsigned long) BAD_PAGETABLE;
+ pmd_val(*pmd) = _PAGE_TABLE_KERNEL | (unsigned long) BAD_PAGETABLE;
return NULL;
}
return (pte_t *) pmd_page(*pmd) + address;
@@ -426,17 +428,17 @@
pte_t * page = (pte_t *) get_free_page(GFP_KERNEL);
if (pmd_none(*pmd)) {
if (page) {
- pmd_val(*pmd) = _PAGE_TABLE | (unsigned long) page;
+ pmd_val(*pmd) = _PAGE_TABLE_USER | (unsigned long) page;
return page + address;
}
- pmd_val(*pmd) = _PAGE_TABLE | (unsigned long) BAD_PAGETABLE;
+ pmd_val(*pmd) = _PAGE_TABLE_USER | (unsigned long) BAD_PAGETABLE;
return NULL;
}
free_page((unsigned long) page);
}
if (pmd_bad(*pmd)) {
printk("Bad pmd in pte_alloc: %08lx\n", pmd_val(*pmd));
- pmd_val(*pmd) = _PAGE_TABLE | (unsigned long) BAD_PAGETABLE;
+ pmd_val(*pmd) = _PAGE_TABLE_USER | (unsigned long) BAD_PAGETABLE;
return NULL;
}
return (pte_t *) pmd_page(*pmd) + address;
--- linux/arch/i386/mm/init.c.orig Sat Jun 8 00:30:07 1996
+++ linux/arch/i386/mm/init.c Sat Jun 8 00:38:01 1996
@@ -172,8 +172,8 @@
: : :"ax");
#endif
wp_works_ok = 1;
- pgd_val(pg_dir[0]) = _PAGE_TABLE | _PAGE_4M | address;
- pgd_val(pg_dir[768]) = _PAGE_TABLE | _PAGE_4M | address;
+ pgd_val(pg_dir[0]) = _PAGE_TABLE_KERNEL | _PAGE_4M | address;
+ pgd_val(pg_dir[768]) = _PAGE_TABLE_KERNEL | _PAGE_4M | address;
pg_dir++;
address += 4*1024*1024;
continue;
@@ -187,8 +187,8 @@
}

/* also map it temporarily at 0x0000000 for init */
- pgd_val(pg_dir[0]) = _PAGE_TABLE | (unsigned long) pg_table;
- pgd_val(pg_dir[768]) = _PAGE_TABLE | (unsigned long) pg_table;
+ pgd_val(pg_dir[0]) = _PAGE_TABLE_KERNEL | (unsigned long) pg_table;
+ pgd_val(pg_dir[768]) = _PAGE_TABLE_KERNEL | (unsigned long) pg_table;
pg_dir++;
for (tmp = 0 ; tmp < PTRS_PER_PTE ; tmp++,pg_table++) {
if (address < end_mem)
--- linux/arch/i386/kernel/ldt.c.orig Sat Jun 8 00:07:12 1996
+++ linux/arch/i386/kernel/ldt.c Sat Jun 8 00:07:30 1996
@@ -34,31 +34,6 @@
return size;
}

-static inline int limits_ok(struct modify_ldt_ldt_s *ldt_info)
-{
- unsigned long base, limit;
- /* linear address of first and last accessible byte */
- unsigned long first, last;
-
- base = ldt_info->base_addr;
- limit = ldt_info->limit;
- if (ldt_info->limit_in_pages)
- limit = limit * PAGE_SIZE + PAGE_SIZE - 1;
-
- first = base;
- last = limit + base;
-
- /* segment grows down? */
- if (ldt_info->contents == 1) {
- /* data segment grows down */
- first = base+limit+1;
- last = base+65535;
- if (ldt_info->seg_32bit)
- last = base-1;
- }
- return (last >= first && last < TASK_SIZE);
-}
-
static int write_ldt(void * ptr, unsigned long bytecount)
{
struct modify_ldt_ldt_s ldt_info;
@@ -74,9 +49,6 @@
memcpy_fromfs(&ldt_info, ptr, sizeof(ldt_info));

if (ldt_info.contents == 3 || ldt_info.entry_number >= LDT_ENTRIES)
- return -EINVAL;
-
- if (!limits_ok(&ldt_info))
return -EINVAL;

if (!current->ldt) {
--- linux/arch/i386/kernel/head.S.orig Sat Jun 8 01:18:29 1996
+++ linux/arch/i386/kernel/head.S Sat Jun 8 01:17:55 1996
@@ -289,13 +289,13 @@
movl $ SYMBOL_NAME(swapper_pg_dir),%edi /* swapper_pg_dir is at 0x1000 */
cld;rep;stosl
/* Identity-map the kernel in low 4MB memory for ease of transition */
-/* set present bit/user r/w */
- movl $ SYMBOL_NAME(pg0)+7,SYMBOL_NAME(swapper_pg_dir)
+/* set present bit/supervisor r/w (not user) */
+ movl $ SYMBOL_NAME(pg0)+3,SYMBOL_NAME(swapper_pg_dir)
/* But the real place is at 0xC0000000 */
/* set present bit/user r/w */
- movl $ SYMBOL_NAME(pg0)+7,SYMBOL_NAME(swapper_pg_dir)+3072
+ movl $ SYMBOL_NAME(pg0)+3,SYMBOL_NAME(swapper_pg_dir)+3072
movl $ SYMBOL_NAME(pg0)+4092,%edi
- movl $0x03ff007,%eax /* 4Mb - 4096 + 7 (r/w user,p) */
+ movl $0x03ff003,%eax /* 4Mb - 4096 + 3 (r/w (not user),p) */
std
1: stosl /* fill the page backwards - more efficient :-) */
subl $0x1000,%eax

---end of patch---