Pull mem-attribute into release branch
diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c
new file mode 100644
index 0000000..3153167
--- /dev/null
+++ b/Documentation/ia64/aliasing-test.c
@@ -0,0 +1,247 @@
+/*
+ * Exercise /dev/mem mmap cases that have been troublesome in the past
+ *
+ * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ *	Bjorn Helgaas <bjorn.helgaas@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
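+
+/*
+ * Build with, e.g., "cc -o aliasing-test aliasing-test.c".  The test
+ * typically needs to run as root so /dev/mem and the sysfs files can be
+ * opened.  Results are reported as PASS/FAIL lines on stderr.
+ */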
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
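+/* Values read are accumulated here so the touches can't be optimized away */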
+int sum;
+
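+/*
+ * mmap "length" bytes of "path" at "offset" and, if "touch" is set, read
+ * every word of the mapping.  Returns 0 on success, 1 if the mmap itself
+ * is refused (callers decide whether that counts as a PASS), or -1 on
+ * unexpected errors.
+ */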
+int map_mem(char *path, off_t offset, size_t length, int touch)
+{
+	int fd, rc;
+	void *addr;
+	int *c;
+
+	fd = open(path, O_RDWR);
+	if (fd == -1) {
+		perror(path);
+		return -1;
+	}
+
+	addr = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset);
+	if (addr == MAP_FAILED) {
+		close(fd);
+		return 1;
+	}
+
+	if (touch) {
+		c = (int *) addr;
+		while (c < (int *) ((char *) addr + length))
+			sum += *c++;
+	}
+
+	rc = munmap(addr, length);
+	if (rc == -1) {
+		perror("munmap");
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
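+/*
+ * Recursively walk "path" and run map_mem() on every file whose name
+ * matches "file" (e.g. the per-bus legacy_mem files), reporting PASS or
+ * FAIL for each one.
+ */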
+int scan_sysfs(char *path, char *file, off_t offset, size_t length, int touch)
+{
+	struct dirent **namelist;
+	char *name, *path2;
+	int i, n, r, rc, result = 0;
+	struct stat buf;
+
+	n = scandir(path, &namelist, 0, alphasort);
+	if (n < 0) {
+		perror("scandir");
+		return -1;
+	}
+
+	for (i = 0; i < n; i++) {
+		rc = 0;
+		name = namelist[i]->d_name;
+
+		if (fnmatch(".", name, 0) == 0)
+			goto skip;
+		if (fnmatch("..", name, 0) == 0)
+			goto skip;
+
+		path2 = malloc(strlen(path) + strlen(name) + 3);
+		strcpy(path2, path);
+		strcat(path2, "/");
+		strcat(path2, name);
+
+		if (fnmatch(file, name, 0) == 0) {
+			rc = map_mem(path2, offset, length, touch);
+			if (rc == 0)
+				fprintf(stderr, "PASS: %s 0x%lx-0x%lx is %s\n", path2, offset, offset + length, touch ? "readable" : "mappable");
+			else if (rc > 0)
+				fprintf(stderr, "PASS: %s 0x%lx-0x%lx not mappable\n", path2, offset, offset + length);
+			else {
+				fprintf(stderr, "FAIL: %s 0x%lx-0x%lx not accessible\n", path2, offset, offset + length);
+				return rc;
+			}
+		} else {
+			r = lstat(path2, &buf);
+			if (r == 0 && S_ISDIR(buf.st_mode)) {
+				rc = scan_sysfs(path2, file, offset, length, touch);
+				if (rc < 0)
+					return rc;
+			}
+		}
+
+		result |= rc;
+		free(path2);
+
+skip:
+		free(namelist[i]);
+	}
+	free(namelist);
+	return result;
+}
+
+char buf[1024];
+
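+/*
+ * Enable the ROM image by writing "1" to the sysfs "rom" file, then read
+ * the whole image.  Returns the number of bytes read, or -1 on error.
+ */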
+int read_rom(char *path)
+{
+	int fd, rc;
+	size_t size = 0;
+
+	fd = open(path, O_RDWR);
+	if (fd == -1) {
+		perror(path);
+		return -1;
+	}
+
+	rc = write(fd, "1", 2);
+	if (rc <= 0) {
+		perror("write");
+		close(fd);
+		return -1;
+	}
+
+	do {
+		rc = read(fd, buf, sizeof(buf));
+		if (rc > 0)
+			size += rc;
+	} while (rc > 0);
+
+	close(fd);
+	return size;
+}
+
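+/*
+ * Recursively walk "path" and run read_rom() on every file whose name
+ * matches "file" (normally the PCI device "rom" files).
+ */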
+int scan_rom(char *path, char *file)
+{
+	struct dirent **namelist;
+	char *name, *path2;
+	int i, n, r, rc, result = 0;
+	struct stat buf;
+
+	n = scandir(path, &namelist, 0, alphasort);
+	if (n < 0) {
+		perror("scandir");
+		return -1;
+	}
+
+	for (i = 0; i < n; i++) {
+		rc = 0;
+		name = namelist[i]->d_name;
+
+		if (fnmatch(".", name, 0) == 0)
+			goto skip;
+		if (fnmatch("..", name, 0) == 0)
+			goto skip;
+
+		path2 = malloc(strlen(path) + strlen(name) + 3);
+		strcpy(path2, path);
+		strcat(path2, "/");
+		strcat(path2, name);
+
+		if (fnmatch(file, name, 0) == 0) {
+			rc = read_rom(path2);
+
+			/*
+			 * It's OK if the ROM is unreadable.  Maybe there
+			 * is no ROM, or some other error occurred.  The
+			 * important thing is that no MCA happened.
+			 */
+			if (rc > 0)
+				fprintf(stderr, "PASS: %s read %d bytes\n", path2, rc);
+			else {
+				fprintf(stderr, "PASS: %s not readable\n", path2);
+				return rc;
+			}
+		} else {
+			r = lstat(path2, &buf);
+			if (r == 0 && S_ISDIR(buf.st_mode)) {
+				rc = scan_rom(path2, file);
+				if (rc < 0)
+					return rc;
+			}
+		}
+
+		result |= rc;
+		free(path2);
+
+skip:
+		free(namelist[i]);
+	}
+	free(namelist);
+	return result;
+}
+
+int main(void)
+{
+	int rc;
+
+	if (map_mem("/dev/mem", 0, 0xA0000, 1) == 0)
+		fprintf(stderr, "PASS: /dev/mem 0x0-0xa0000 is readable\n");
+	else
+		fprintf(stderr, "FAIL: /dev/mem 0x0-0xa0000 not accessible\n");
+
+	/*
+	 * It's not safe to blindly read the VGA frame buffer.  If you know
+	 * how to poke the card the right way, it should respond, but it's
+	 * not safe in general.  Many machines, e.g., Intel chipsets, cover
+	 * up a non-responding card by just returning -1, but others will
+	 * report the failure as a machine check.
+	 */
+	if (map_mem("/dev/mem", 0xA0000, 0x20000, 0) == 0)
+		fprintf(stderr, "PASS: /dev/mem 0xa0000-0xc0000 is mappable\n");
+	else
+		fprintf(stderr, "FAIL: /dev/mem 0xa0000-0xc0000 not accessible\n");
+
+	if (map_mem("/dev/mem", 0xC0000, 0x40000, 1) == 0)
+		fprintf(stderr, "PASS: /dev/mem 0xc0000-0x100000 is readable\n");
+	else
+		fprintf(stderr, "FAIL: /dev/mem 0xc0000-0x100000 not accessible\n");
+
+	/*
+	 * Often you can map all the individual pieces above (0-0xA0000,
+	 * 0xA0000-0xC0000, and 0xC0000-0x100000), but can't map the whole
+	 * thing at once.  This is because the individual pieces use different
+	 * attributes, and there's no single attribute supported over the
+	 * whole region.
+	 */
+	rc = map_mem("/dev/mem", 0, 1024*1024, 0);
+	if (rc == 0)
+		fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 is mappable\n");
+	else if (rc > 0)
+		fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 not mappable\n");
+	else
+		fprintf(stderr, "FAIL: /dev/mem 0x0-0x100000 not accessible\n");
+
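+	/*
+	 * Repeat the tests through the per-bus legacy_mem files, which
+	 * exercise the PCI legacy memory mmap path rather than /dev/mem.
+	 */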
+	scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0, 0xA0000, 1);
+	scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0xA0000, 0x20000, 0);
+	scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0xC0000, 0x40000, 1);
+	scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0, 1024*1024, 0);
+
+	scan_rom("/sys/devices", "rom");
+
+	return 0;
+}
diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt
index 38f9a52..9a431a7 100644
--- a/Documentation/ia64/aliasing.txt
+++ b/Documentation/ia64/aliasing.txt
@@ -112,16 +112,6 @@
 
 	The /dev/mem mmap constraints apply.
 
-	However, since this is for mapping legacy MMIO space, WB access
-	does not make sense.  This matters on machines without legacy
-	VGA support: these machines may have WB memory for the entire
-	first megabyte (or even the entire first granule).
-
-	On these machines, we could mmap legacy_mem as WB, which would
-	be safe in terms of attribute aliasing, but X has no way of
-	knowing that it is accessing regular memory, not a frame buffer,
-	so the kernel should fail the mmap rather than doing it with WB.
-
     read/write of /dev/mem
 
 	This uses copy_from_user(), which implicitly uses a kernel
@@ -138,14 +128,20 @@
 
     ioremap()
 
-	This returns a kernel identity mapping for use inside the
-	kernel.
+	This returns a mapping for use inside the kernel.
 
 	If the region is in kern_memmap, we should use the attribute
-	specified there.  Otherwise, if the EFI memory map reports that
-	the entire granule supports WB, we should use that (granules
-	that are partially reserved or occupied by firmware do not appear
-	in kern_memmap).  Otherwise, we should use a UC mapping.
+	specified there.
+
+	If the EFI memory map reports that the entire granule supports
+	WB, we should use that (granules that are partially reserved
+	or occupied by firmware do not appear in kern_memmap).
+
+	If the granule contains non-WB memory, but we can cover the
+	region safely with kernel page table mappings, we can use
+	ioremap_page_range() as most other architectures do.
+
+	Failing all of the above, we have to fall back to a UC mapping.
 
 PAST PROBLEM CASES
 
@@ -158,7 +154,7 @@
       succeed.  It may create either WB or UC user mappings, depending
       on whether the region is in kern_memmap or the EFI memory map.
 
-    mmap of 0x0-0xA0000 /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
+    mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
 
       See https://bugzilla.novell.com/show_bug.cgi?id=140858.
 
@@ -171,28 +167,25 @@
       so it is safe to use WB mappings.
 
       The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
-      which will use a granule-sized UC mapping covering 0-0xFFFFF.  This
-      granule covers some WB-only memory, but since UC is non-speculative,
-      the processor will never generate an uncacheable reference to the
-      WB-only areas unless the driver explicitly touches them.
+      which uses a granule-sized UC mapping.  This granule will cover some
+      WB-only memory, but since UC is non-speculative, the processor will
+      never generate an uncacheable reference to the WB-only areas unless
+      the driver explicitly touches them.
 
     mmap of 0x0-0xFFFFF legacy_mem by "X"
 
-      If the EFI memory map reports this entire range as WB, there
-      is no VGA MMIO hole, and the mmap should fail or be done with
-      a WB mapping.
+      If the EFI memory map reports that the entire range supports the
+      same attributes, we can allow the mmap (and we will prefer WB if
+      supported, as is the case with HP sx[12]000 machines with VGA
+      disabled).
 
-      There's no easy way for X to determine whether the 0xA0000-0xBFFFF
-      region is a frame buffer or just memory, so I think it's best to
-      just fail this mmap request rather than using a WB mapping.  As
-      far as I know, there's no need to map legacy_mem with WB
-      mappings.
+      If EFI reports the range as partly WB and partly UC (as on sx[12]000
+      machines with VGA enabled), we must fail the mmap because there's no
+      safe attribute to use.
 
-      Otherwise, a UC mapping of the entire region is probably safe.
-      The VGA hole means the region will not be in kern_memmap.  The
-      HP sx1000 chipset doesn't support UC access to the memory surrounding
-      the VGA hole, but X doesn't need that area anyway and should not
-      reference it.
+      If EFI reports some of the range but not all (as on Intel firmware
+      that doesn't report the VGA frame buffer at all), we should fail the
+      mmap and force the user to map just the specific region of interest.
 
     mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
 
@@ -202,6 +195,16 @@
       This is a special case of the previous case, and the mmap should
       fail for the same reason as above.
 
+    read of /sys/devices/.../rom
+
+      For VGA devices, this may cause an ioremap() of 0xC0000.  This
+      used to be done with a UC mapping, because the VGA frame buffer
+      at 0xA0000 prevents use of a WB granule.  The UC mapping causes
+      an MCA on HP sx[12]000 chipsets.
+
+      We should use WB page table mappings to avoid covering the VGA
+      frame buffer.
+
 NOTES
 
     [1] SDM rev 2.2, vol 2, sec 4.4.1.
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index f45f91d..78d29b7 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -660,6 +660,29 @@
 	return NULL;
 }
 
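+/*
+ * Return 1 if any descriptor in the EFI memory map overlaps the physical
+ * range [phys_addr, phys_addr + size).
+ */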
+static int
+efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
+{
+	void *efi_map_start, *efi_map_end, *p;
+	efi_memory_desc_t *md;
+	u64 efi_desc_size;
+	unsigned long end;
+
+	efi_map_start = __va(ia64_boot_param->efi_memmap);
+	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
+	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+
+	end = phys_addr + size;
+
+	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+		md = p;
+
+		if (md->phys_addr < end && efi_md_end(md) > phys_addr)
+			return 1;
+	}
+	return 0;
+}
+
 u32
 efi_mem_type (unsigned long phys_addr)
 {
@@ -766,11 +789,28 @@
 int
 valid_mmap_phys_addr_range (unsigned long pfn, unsigned long size)
 {
+	unsigned long phys_addr = pfn << PAGE_SHIFT;
+	u64 attr;
+
+	attr = efi_mem_attribute(phys_addr, size);
+
 	/*
-	 * MMIO regions are often missing from the EFI memory map.
-	 * We must allow mmap of them for programs like X, so we
-	 * currently can't do any useful validation.
+	 * /dev/mem mmap uses normal user pages, so we don't need the entire
+	 * granule, but the entire region we're mapping must support the same
+	 * attribute.
 	 */
+	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
+		return 1;
+
+	/*
+	 * Intel firmware doesn't tell us about all the MMIO regions, so
+	 * in general we have to allow mmap requests.  But if EFI *does*
+	 * tell us about anything inside this region, we should deny it.
+	 * The user can always map a smaller region to avoid the overlap.
+	 */
+	if (efi_memmap_intersects(phys_addr, size))
+		return 0;
+
 	return 1;
 }
 
diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c
index 4280c07..2a14062 100644
--- a/arch/ia64/mm/ioremap.c
+++ b/arch/ia64/mm/ioremap.c
@@ -1,5 +1,5 @@
 /*
- * (c) Copyright 2006 Hewlett-Packard Development Company, L.P.
+ * (c) Copyright 2006, 2007 Hewlett-Packard Development Company, L.P.
  *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -10,51 +10,101 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/efi.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 #include <asm/meminit.h>
 
 static inline void __iomem *
-__ioremap (unsigned long offset, unsigned long size)
+__ioremap (unsigned long phys_addr)
 {
-	return (void __iomem *) (__IA64_UNCACHED_OFFSET | offset);
+	return (void __iomem *) (__IA64_UNCACHED_OFFSET | phys_addr);
 }
 
 void __iomem *
-ioremap (unsigned long offset, unsigned long size)
+ioremap (unsigned long phys_addr, unsigned long size)
 {
+	void __iomem *addr;
+	struct vm_struct *area;
+	unsigned long offset;
+	pgprot_t prot;
 	u64 attr;
 	unsigned long gran_base, gran_size;
+	unsigned long page_base;
 
 	/*
 	 * For things in kern_memmap, we must use the same attribute
 	 * as the rest of the kernel.  For more details, see
 	 * Documentation/ia64/aliasing.txt.
 	 */
-	attr = kern_mem_attribute(offset, size);
+	attr = kern_mem_attribute(phys_addr, size);
 	if (attr & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(offset);
+		return (void __iomem *) phys_to_virt(phys_addr);
 	else if (attr & EFI_MEMORY_UC)
-		return __ioremap(offset, size);
+		return __ioremap(phys_addr);
 
 	/*
 	 * Some chipsets don't support UC access to memory.  If
 	 * WB is supported for the whole granule, we prefer that.
 	 */
-	gran_base = GRANULEROUNDDOWN(offset);
-	gran_size = GRANULEROUNDUP(offset + size) - gran_base;
+	gran_base = GRANULEROUNDDOWN(phys_addr);
+	gran_size = GRANULEROUNDUP(phys_addr + size) - gran_base;
 	if (efi_mem_attribute(gran_base, gran_size) & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(offset);
+		return (void __iomem *) phys_to_virt(phys_addr);
 
-	return __ioremap(offset, size);
+	/*
+	 * WB is not supported for the whole granule, so we can't use
+	 * the region 7 identity mapping.  If we can safely cover the
+	 * area with kernel page table mappings, we can use those
+	 * instead.
+	 */
+	page_base = phys_addr & PAGE_MASK;
+	size = PAGE_ALIGN(phys_addr + size) - page_base;
+	if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB) {
+		prot = PAGE_KERNEL;
+
+		/*
+		 * Mappings have to be page-aligned
+		 */
+		offset = phys_addr & ~PAGE_MASK;
+		phys_addr &= PAGE_MASK;
+
+		/*
+		 * Reserve a vmalloc-area range and map the region into
+		 * it with kernel page tables.
+		 */
+		area = get_vm_area(size, VM_IOREMAP);
+		if (!area)
+			return NULL;
+
+		area->phys_addr = phys_addr;
+		addr = (void __iomem *) area->addr;
+		if (ioremap_page_range((unsigned long) addr,
+				(unsigned long) addr + size, phys_addr, prot)) {
+			vunmap((void __force *) addr);
+			return NULL;
+		}
+
+		return (void __iomem *) (offset + (char __iomem *)addr);
+	}
+
+	return __ioremap(phys_addr);
 }
 EXPORT_SYMBOL(ioremap);
 
 void __iomem *
-ioremap_nocache (unsigned long offset, unsigned long size)
+ioremap_nocache (unsigned long phys_addr, unsigned long size)
 {
-	if (kern_mem_attribute(offset, size) & EFI_MEMORY_WB)
+	if (kern_mem_attribute(phys_addr, size) & EFI_MEMORY_WB)
 		return NULL;
 
-	return __ioremap(offset, size);
+	return __ioremap(phys_addr);
 }
 EXPORT_SYMBOL(ioremap_nocache);
+
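+/*
+ * ioremap() hands out an address in the vmalloc range (RGN_GATE) only
+ * when it had to build kernel page table mappings; those are the only
+ * mappings that need to be torn down here.
+ */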
+void
+iounmap (volatile void __iomem *addr)
+{
+	if (REGION_NUMBER(addr) == RGN_GATE)
+		vunmap((void *) ((unsigned long) addr & PAGE_MASK));
+}
+EXPORT_SYMBOL(iounmap);
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 0e83f3b..9f63589 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -659,8 +659,6 @@
 		return -EINVAL;
 	prot = phys_mem_access_prot(NULL, vma->vm_pgoff, size,
 				    vma->vm_page_prot);
-	if (pgprot_val(prot) != pgprot_val(pgprot_noncached(vma->vm_page_prot)))
-		return -EINVAL;
 
 	addr = pci_get_legacy_mem(bus);
 	if (IS_ERR(addr))
diff --git a/include/asm-ia64/io.h b/include/asm-ia64/io.h
index 6311e16..eb17a86 100644
--- a/include/asm-ia64/io.h
+++ b/include/asm-ia64/io.h
@@ -421,11 +421,7 @@
 
 extern void __iomem * ioremap(unsigned long offset, unsigned long size);
 extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
-
-static inline void
-iounmap (volatile void __iomem *addr)
-{
-}
+extern void iounmap (volatile void __iomem *addr);
 
 /* Use normal IO mappings for DMI */
 #define dmi_ioremap ioremap